From: Jan Szejko Date: Fri, 26 Jan 2018 12:01:34 +0000 (+0100) Subject: disable crawling for catalogue pages with multiple tags X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/2e45f239552a9e378a9e1f65274dfb7c13df220d?ds=sidebyside disable crawling for catalogue pages with multiple tags --- diff --git a/src/catalogue/views.py b/src/catalogue/views.py index bba773d32..5273ab429 100644 --- a/src/catalogue/views.py +++ b/src/catalogue/views.py @@ -6,6 +6,7 @@ from collections import OrderedDict import random from django.conf import settings +from django.http.response import HttpResponseForbidden from django.template import RequestContext from django.template.loader import render_to_string from django.shortcuts import render_to_response, get_object_or_404, render, redirect @@ -27,6 +28,7 @@ from catalogue.helpers import get_top_level_related_tags from catalogue.models import Book, Collection, Tag, Fragment from catalogue.utils import split_tags from catalogue.models.tag import prefetch_relations +from wolnelektury.utils import is_crawler staff_required = user_passes_test(lambda user: user.is_staff) @@ -222,6 +224,9 @@ def tagged_object_list(request, tags, list_type): except ResponseInstead as e: return e.response + if is_crawler(request) and len(tags) > 1: + return HttpResponseForbidden('address removed from crawling. check robots.txt')
+ if list_type == 'gallery' and any(tag.category == 'set' for tag in tags): raise Http404 diff --git a/src/wolnelektury/utils.py b/src/wolnelektury/utils.py index d20039cd2..2657a5542 100644 --- a/src/wolnelektury/utils.py +++ b/src/wolnelektury/utils.py @@ -155,3 +155,14 @@ class UnicodeCSVWriter(object): # the original re.escape messes with unicode def re_escape(s): return re.sub(r"[(){}\[\].*?|^$\\+-]", r"\\\g<0>", s) + + +BOT_BITS = ['bot', 'slurp', 'spider', 'facebook', 'crawler', 'parser', 'http'] + + +def is_crawler(request): + user_agent = request.META.get('HTTP_USER_AGENT') + if not user_agent: + return True + user_agent = user_agent.lower() + return any(bot_bit in user_agent for bot_bit in BOT_BITS)