From: Marcin Koziej <marcin.koziej@nowoczesnapolska.org.pl> Date: Wed, 12 Oct 2011 14:49:28 +0000 (+0200) Subject: basic query using Dublin core fields works X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/1cf1830b0f97f0c517862c8a2030e2fc4d76108e basic query using Dublin core fields works --- diff --git a/apps/search/__init__.py b/apps/search/__init__.py index 8b1378917..f45c150f1 100644 --- a/apps/search/__init__.py +++ b/apps/search/__init__.py @@ -1 +1,3 @@ - +from index import Index, Search +import lucene +lucene.initVM(lucene.CLASSPATH) diff --git a/apps/search/index.py b/apps/search/index.py index 8f7722a77..94e6f099c 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,12 +1,107 @@ from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter +from lucene import SimpleFSDirectory, IndexWriter, File, Field, NumericField, PolishAnalyzer, \ + Version, Document, JavaError, IndexSearcher, QueryParser, Term import os +import errno +from librarian import dcparser +from catalogue.models import Book -class BookSearch(object): +class IndexStore(object): def __init__(self): - if not os.exists(settings.SEARCH_INDEX): - os.mkdir(settings.SEARCH_INDEX) - self.store = IndexWriter(store, ) - + self.make_index_dir() + self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + + def make_index_dir(self): + try: + os.makedirs(settings.SEARCH_INDEX) + except OSError as exc: + if exc.errno == errno.EEXIST: + pass + else: raise + + +class Index(IndexStore): + def __init__(self): + IndexStore.__init__(self) + self.index = None + + def open(self, analyzer=None): + if not analyzer: + analyzer = PolishAnalyzer(Version.LUCENE_34) + if self.index: + raise Exception("Index is already opened") + self.index = IndexWriter(self.store, analyzer, IndexWriter.MaxFieldLength.LIMITED) + return self.index + + def close(self): + self.index.optimize() + self.index.close() + + def index_book(self, book, overwrite=True): + book_info = 
dcparser.parse(book.xml_file) + + if overwrite: + self.index.deleteDocuments(Term("id", str(book.id))) + + doc = Document() + doc.add(NumericField("id", Field.Store.YES, True).setIntValue(book.id)) + doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)) + + # validator, name + for field in dcparser.BookInfo.FIELDS: + if hasattr(book_info, field.name): + if not getattr(book_info, field.name): + continue + # since no type information is available, we use validator + type_indicator = field.validator + if type_indicator == dcparser.as_unicode: + s = getattr(book_info, field.name) + if field.multiple: + s = ', '.join(s) + try: + doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)) + except JavaError as je: + raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args)) + elif type_indicator == dcparser.as_person: + p = getattr(book_info, field.name) + if isinstance(p, dcparser.Person): + persons = str(p) + else: + persons = ', '.join(map(str, p)) + doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)) + elif type_indicator == dcparser.as_date: + dt = getattr(book_info, field.name) + doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)) + + self.index.addDocument(doc) + + def __enter__(self): + self.open() + return self + + def __exit__(self, type, value, tb): + self.close() + + +class Search(IndexStore): + def __init__(self, default_field="description"): + IndexStore.__init__(self) + self.analyzer = PolishAnalyzer(Version.LUCENE_34) + self.searcher = IndexSearcher(self.store, True) + self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer) + + def query(self, query): + return self.parser.parse(query) + + def search(self, query, max_results=50): + """Returns (books, total_hits) + """ + + tops = self.searcher.search(self.query(query), max_results) + bks = [] + for found in tops.scoreDocs: + doc = 
self.searcher.doc(found.doc) + bks.append(Book.objects.get(id=doc.get("id"))) + return (bks, tops.totalHits) diff --git a/apps/search/tests/__init__.py b/apps/search/tests/__init__.py new file mode 100644 index 000000000..403c290f0 --- /dev/null +++ b/apps/search/tests/__init__.py @@ -0,0 +1 @@ +from search.tests.index import * diff --git a/apps/search/tests/files/fraszka-do-anusie.xml b/apps/search/tests/files/fraszka-do-anusie.xml new file mode 100755 index 000000000..3bbda155e --- /dev/null +++ b/apps/search/tests/files/fraszka-do-anusie.xml @@ -0,0 +1,49 @@ +<?xml version='1.0' encoding='utf-8'?> +<utwor> + <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/"> +<rdf:Description rdf:about="http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:S%C4%99p-Szarzy%C5%84ski/Rytmy/Fraszka_do_Anusie"> +<dc:creator xml:lang="pl">Sęp Szarzyński, Mikołaj</dc:creator> +<dc:title xml:lang="pl">Fraszka do Anusie</dc:title> +<dc:contributor.editor xml:lang="pl">Sekuła, Aleksandra</dc:contributor.editor> +<dc:contributor.technical_editor xml:lang="pl">Sutkowska, Olga</dc:contributor.technical_editor> +<dc:publisher xml:lang="pl">Fundacja Nowoczesna Polska</dc:publisher> +<dc:subject.period xml:lang="pl">Barok</dc:subject.period> +<dc:subject.type xml:lang="pl">Liryka</dc:subject.type> +<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre> +<dc:description xml:lang="pl">Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN.</dc:description> +<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:identifier.url> +<dc:source.URL xml:lang="pl">http://www.polona.pl/Content/8759</dc:source.URL> +<dc:source xml:lang="pl">Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. 
Wende, Warszawa, 1914</dc:source> +<dc:rights xml:lang="pl">Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581</dc:rights> +<dc:date.pd xml:lang="pl">1581</dc:date.pd> +<dc:format xml:lang="pl">xml</dc:format> +<dc:type xml:lang="pl">text</dc:type> +<dc:type xml:lang="en">text</dc:type> +<dc:date xml:lang="pl">2008-12-29</dc:date> +<dc:audience xml:lang="pl">L</dc:audience> +<dc:audience xml:lang="pl">L</dc:audience> +<dc:language xml:lang="pl">pol</dc:language> +</rdf:Description> +</rdf:RDF> + <liryka_l> + +<autor_utworu>Mikołaj Sęp Szarzyński</autor_utworu> + +<nazwa_utworu>Fraszka do Anusie</nazwa_utworu> + + + +<strofa><begin id="b1230084410751"/><motyw id="m1230084410751">Kochanek, Łzy, Miłość, Oko, Serce, Wzrok</motyw>Jeśli oczu hamować swoich nie umiały/ +Leśnych krynic boginie, aby nie płakały,/ +Gdy baczyły<pe><slowo_obce>baczyły</slowo_obce> --- tu: zobaczyły, patrzyły na.</pe> przy studni Narcyza pięknego,/ +A on umarł prze miłość oblicza swojego;/ +Jeśli nieśmiertelnym stanom żałość rozkazuje,/ +Gdy niebaczna fortuna co niesłusznie psuje:</strofa> + +<strofa>Jakoż ja mam hamować, by na lice moje/ +Z oczu smutnych żałośne nie płynęły zdroje?/ +Jako serce powściągać, aby nie wzdychało/ +I od ciężkiej żałości omdlewać nie miało?<end id="e1230084410751"/></strofa> + +</liryka_l> +</utwor> diff --git a/apps/search/tests/files/fraszki.xml b/apps/search/tests/files/fraszki.xml new file mode 100755 index 000000000..edb29abbc --- /dev/null +++ b/apps/search/tests/files/fraszki.xml @@ -0,0 +1,27 @@ +<?xml version='1.0' encoding='utf-8'?> +<utwor> +<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/"> +<rdf:Description rdf:about=""> +<dc:creator xml:lang="pl">Kochanowski, Jan</dc:creator> +<dc:title xml:lang="pl">Fraszki</dc:title> +<dc:relation.hasPart xml:lang="pl">http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie</dc:relation.hasPart> + +<dc:publisher xml:lang="pl">Fundacja Nowoczesna 
Polska</dc:publisher> +<dc:subject.period xml:lang="pl">Renesans</dc:subject.period> +<dc:subject.type xml:lang="pl">Liryka</dc:subject.type> +<dc:subject.genre xml:lang="pl">Fraszka</dc:subject.genre> + +<dc:description xml:lang="pl"></dc:description> +<dc:identifier.url xml:lang="pl">http://wolnelektury.pl/lektura/fraszki</dc:identifier.url> +<dc:source xml:lang="pl"></dc:source> +<dc:rights xml:lang="pl">Domena publiczna - Jan Kochanowski zm. 1584</dc:rights> +<dc:date.pd xml:lang="pl">1584</dc:date.pd> +<dc:format xml:lang="pl">xml</dc:format> +<dc:type xml:lang="pl">text</dc:type> + +<dc:type xml:lang="en">text</dc:type> +<dc:date xml:lang="pl">2008-11-12</dc:date> +<dc:language xml:lang="pl">pol</dc:language> +</rdf:Description> +</rdf:RDF> +</utwor> diff --git a/apps/search/tests/index.py b/apps/search/tests/index.py new file mode 100644 index 000000000..ed02c2354 --- /dev/null +++ b/apps/search/tests/index.py @@ -0,0 +1,31 @@ +from __future__ import with_statement + +from search import Index, Search +from catalogue import models +from catalogue.test_utils import WLTestCase +#from nose.tools import raises +from os import path + + +class BookSearchTests(WLTestCase): + def setUp(self): + WLTestCase.setUp(self) + + txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml') + self.book = models.Book.from_xml_file(txt) + + search = Index() + with search: + search.index_book(self.book) + print "index: %s" % search + + def test_search(self): + search = Search() + bks,_= search.search("wolne") + self.assertEqual(len(bks), 1) + self.assertEqual(bks[0].id, 1) + + bks,_= search.search("technical_editors: sutkowska") + self.assertEqual(len(bks), 1) + self.assertEqual(bks[0].id, 1) +