From 1cf1830b0f97f0c517862c8a2030e2fc4d76108e Mon Sep 17 00:00:00 2001 From: Marcin Koziej Date: Wed, 12 Oct 2011 16:49:28 +0200 Subject: [PATCH 1/1] basic query using Dublin core fields works --- apps/search/__init__.py | 4 +- apps/search/index.py | 107 +++++++++++++++++- apps/search/tests/__init__.py | 1 + apps/search/tests/files/fraszka-do-anusie.xml | 49 ++++++++ apps/search/tests/files/fraszki.xml | 27 +++++ apps/search/tests/index.py | 31 +++++ 6 files changed, 212 insertions(+), 7 deletions(-) create mode 100644 apps/search/tests/__init__.py create mode 100755 apps/search/tests/files/fraszka-do-anusie.xml create mode 100755 apps/search/tests/files/fraszki.xml create mode 100644 apps/search/tests/index.py diff --git a/apps/search/__init__.py b/apps/search/__init__.py index 8b1378917..f45c150f1 100644 --- a/apps/search/__init__.py +++ b/apps/search/__init__.py @@ -1 +1,3 @@ - +from index import Index, Search +import lucene +lucene.initVM(lucene.CLASSPATH) diff --git a/apps/search/index.py b/apps/search/index.py index 8f7722a77..94e6f099c 100644 --- a/apps/search/index.py +++ b/apps/search/index.py @@ -1,12 +1,107 @@ from django.conf import settings -from lucene import SimpleFSDirectory, IndexWriter +from lucene import SimpleFSDirectory, IndexWriter, File, Field, NumericField, PolishAnalyzer, \ + Version, Document, JavaError, IndexSearcher, QueryParser, Term import os +import errno +from librarian import dcparser +from catalogue.models import Book -class BookSearch(object): +class IndexStore(object): def __init__(self): - if not os.exists(settings.SEARCH_INDEX): - os.mkdir(settings.SEARCH_INDEX) - self.store = IndexWriter(store, ) - + self.make_index_dir() + self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX)) + + def make_index_dir(self): + try: + os.makedirs(settings.SEARCH_INDEX) + except OSError as exc: + if exc.errno == errno.EEXIST: + pass + else: raise + + +class Index(IndexStore): + def __init__(self): + IndexStore.__init__(self) + self.index = None + + def open(self, analyzer=None): + if not analyzer: + analyzer = PolishAnalyzer(Version.LUCENE_34) + if self.index: + raise Exception("Index is already opened") + self.index = IndexWriter(self.store, analyzer, IndexWriter.MaxFieldLength.LIMITED) + return self.index + + def close(self): + self.index.optimize() + self.index.close() + + def index_book(self, book, overwrite=True): + book_info = dcparser.parse(book.xml_file) + + if overwrite: + self.index.deleteDocuments(Term("id", str(book.id))) + + doc = Document() + doc.add(NumericField("id", Field.Store.YES, True).setIntValue(book.id)) + doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)) + + # validator, name + for field in dcparser.BookInfo.FIELDS: + if hasattr(book_info, field.name): + if not getattr(book_info, field.name): + continue + # since no type information is available, we use validator + type_indicator = field.validator + if type_indicator == dcparser.as_unicode: + s = getattr(book_info, field.name) + if field.multiple: + s = ', '.join(s) + try: + doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)) + except JavaError as je: + raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args)) + elif type_indicator == dcparser.as_person: + p = getattr(book_info, field.name) + if isinstance(p, dcparser.Person): + persons = str(p) + else: + persons = ', '.join(map(str, p)) + doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)) + elif type_indicator == dcparser.as_date: + dt = getattr(book_info, field.name) + doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)) + + self.index.addDocument(doc) + + def __enter__(self): + self.open() + return self + + def __exit__(self, type, value, tb): + self.close() + + +class Search(IndexStore): + def __init__(self, default_field="description"): + IndexStore.__init__(self) + self.analyzer = PolishAnalyzer(Version.LUCENE_34) + self.searcher = IndexSearcher(self.store, True) + self.parser = QueryParser(Version.LUCENE_34, default_field, self.analyzer) + + def query(self, query): + return self.parser.parse(query) + + def search(self, query, max_results=50): + """Returns (books, total_hits) + """ + + tops = self.searcher.search(self.query(query), max_results) + bks = [] + for found in tops.scoreDocs: + doc = self.searcher.doc(found.doc) + bks.append(Book.objects.get(id=doc.get("id"))) + return (bks, tops.totalHits) diff --git a/apps/search/tests/__init__.py b/apps/search/tests/__init__.py new file mode 100644 index 000000000..403c290f0 --- /dev/null +++ b/apps/search/tests/__init__.py @@ -0,0 +1 @@ +from search.tests.index import * diff --git a/apps/search/tests/files/fraszka-do-anusie.xml b/apps/search/tests/files/fraszka-do-anusie.xml new file mode 100755 index 000000000..3bbda155e --- /dev/null +++ b/apps/search/tests/files/fraszka-do-anusie.xml @@ -0,0 +1,49 @@ + + + + +Sęp Szarzyński, Mikołaj +Fraszka do Anusie +Sekuła, Aleksandra +Sutkowska, Olga +Fundacja Nowoczesna Polska +Barok +Liryka +Fraszka +Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. +http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie +http://www.polona.pl/Content/8759 +Szarzyński Sęp, Mikołaj (ca 1550-1581), Rytmy abo Wiersze polskie w wyborze, E. Wende, Warszawa, 1914 +Domena publiczna - Mikołaj Sęp Szarzyński zm. 1581 +1581 +xml +text +text +2008-12-29 +L +L +pol + + + + +Mikołaj Sęp Szarzyński + +Fraszka do Anusie + + + +Kochanek, Łzy, Miłość, Oko, Serce, WzrokJeśli oczu hamować swoich nie umiały/ +Leśnych krynic boginie, aby nie płakały,/ +Gdy baczyłybaczyły --- tu: zobaczyły, patrzyły na. przy studni Narcyza pięknego,/ +A on umarł prze miłość oblicza swojego;/ +Jeśli nieśmiertelnym stanom żałość rozkazuje,/ +Gdy niebaczna fortuna co niesłusznie psuje: + +Jakoż ja mam hamować, by na lice moje/ +Z oczu smutnych żałośne nie płynęły zdroje?/ +Jako serce powściągać, aby nie wzdychało/ +I od ciężkiej żałości omdlewać nie miało? + + + diff --git a/apps/search/tests/files/fraszki.xml b/apps/search/tests/files/fraszki.xml new file mode 100755 index 000000000..edb29abbc --- /dev/null +++ b/apps/search/tests/files/fraszki.xml @@ -0,0 +1,27 @@ + + + + +Kochanowski, Jan +Fraszki +http://wolnelektury.pl/katalog/lektura/fraszka-do-anusie + +Fundacja Nowoczesna Polska +Renesans +Liryka +Fraszka + + +http://wolnelektury.pl/lektura/fraszki + +Domena publiczna - Jan Kochanowski zm. 1584 +1584 +xml +text + +text +2008-11-12 +pol + + + diff --git a/apps/search/tests/index.py b/apps/search/tests/index.py new file mode 100644 index 000000000..ed02c2354 --- /dev/null +++ b/apps/search/tests/index.py @@ -0,0 +1,31 @@ +from __future__ import with_statement + +from search import Index, Search +from catalogue import models +from catalogue.test_utils import WLTestCase +#from nose.tools import raises +from os import path + + +class BookSearchTests(WLTestCase): + def setUp(self): + WLTestCase.setUp(self) + + txt = path.join(path.dirname(__file__), 'files/fraszka-do-anusie.xml') + self.book = models.Book.from_xml_file(txt) + + search = Index() + with search: + search.index_book(self.book) + print "index: %s" % search + + def test_search(self): + search = Search() + bks,_= search.search("wolne") + self.assertEqual(len(bks), 1) + self.assertEqual(bks[0].id, 1) + + bks,_= search.search("technical_editors: sutkowska") + self.assertEqual(len(bks), 1) + self.assertEqual(bks[0].id, 1) + -- 2.20.1