# Tags whose elements are footnotes.
footnote_tags = ['pa', 'pt', 'pr', 'pe']

# Top-level tags that the indexing loop skips when walking the document.
# The RDF entry is written in Clark notation ({namespace-uri}localname).
skip_header_tags = [
    'autor_utworu',
    'nazwa_utworu',
    'dzielo_nadrzedne',
    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF',
]

# Captures the last run of digits in a string, allowing only ']', '.' and
# spaces after it.  Written as a raw string so that the backslash in `\]`
# reaches the regex engine untouched instead of being read as a (invalid)
# string escape sequence; the compiled pattern is unchanged.
published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
# get published date
source = book_info.source_name
match = self.published_date_re.search(source)
- print("published date is %s %s" % (match, match is not None and match.groups()))
if match is not None:
fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
fragments = {}
snippets = Snippets(book.id).open('w')
- position = 0
try:
- for header in list(master):
+ for header, position in zip(list(master), range(len(master))):
if header.tag in self.skip_header_tags:
continue
for start, end in walker(header, ignore_tags=self.ignore_content_tags):
# handle footnotes
- if start is not None and start.tag in self.footnote_tags:
- footnote = ' '.join(start.itertext())
- elif end is not None and footnote is not None and end.tag in self.footnote_tags:
- doc = add_part(snippets, header_index=position, header_type=header.tag,
- content=footnote)
+ # if start is not None and start.tag in self.footnote_tags:
+ # footnote = ' '.join(start.itertext())
+ # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
+ # doc = add_part(snippets, header_index=position, header_type=header.tag,
+ # content=footnote)
- self.index.addDocument(doc)
+ # self.index.addDocument(doc)
- footnote = None
+ # footnote = None
# handle fragments and themes.
if start is not None and start.tag == 'begin':
content=fix_format(content))
self.index.addDocument(doc)
- position += 1
finally:
snippets.close()
pd = stored.get("published_date")
if pd is None:
- print "published_date is none for book %d" % self.book_id
pd = 0
self.published_date = int(pd)
raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
self._hits += other._hits
if other.score > self.score:
- self.score = other.score
+ self._score = other._score
return self
def get_book(self):
tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
for theme in themes:
name_tokens = self.search.get_tokens(theme.name, 'POLISH')
- print "THEME HIT: %s in %s" % (tokens, name_tokens)
for t in tokens:
if t in name_tokens:
if not theme in themes_hit:
return q
def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                  filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
    """
    Run a phrase query for `searched` against a single `field`.

    searched     -- the text to look for; tokenized with self.get_tokens
    field        -- name of the field to tokenize for and to query
    book         -- when true, an extra term filter is_book:true is applied
    max_results  -- passed to the searcher as the result limit
    fuzzy        -- forwarded to self.make_phrase
    filters      -- optional list of filters; it is copied, never modified
    tokens_cache -- optional dict handed to self.get_tokens as `cached`
    boost        -- accepted for signature compatibility; not used here
    snippets     -- when true, snippets are fetched for every hit
    slop         -- forwarded to self.make_phrase

    Returns a list with one SearchResult per entry of the searcher's scoreDocs.
    """
    # Work on a private copy: the is_book filter appended below must not
    # leak into a list the caller may reuse for its next search call.
    filters = [] if filters is None else list(filters)
    if tokens_cache is None:
        tokens_cache = {}

    tokens = self.get_tokens(searched, field, cached=tokens_cache)
    query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)

    if book:
        filters.append(self.term_filter(Term('is_book', 'true')))
    top = self.searcher.search(query, self.chain_filters(filters), max_results)

    results = []
    for found in top.scoreDocs:
        if snippets:
            # A falsy snippet value is normalized to None, exactly as the
            # former `snippets and ... or None` expression did.
            found_snippets = self.get_snippets(found, query) or None
        else:
            found_snippets = None
        results.append(SearchResult(self, found, snippets=found_snippets, searched=searched))
    return results
def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
- filters=None, tokens_cache=None, boost=None):
+ filters=None, tokens_cache=None, boost=None, snippets=True):
if filters is None: filters = []
if tokens_cache is None: tokens_cache = {}
top = self.searcher.search(query, self.chain_filters(filters), max_results)
return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
- snippets=self.get_snippets(found, query)) for found in top.scoreDocs]
+ snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
"""