7 from librarian.parser import WLDocument
# Regex fragment for the common URL prefix that is interpolated (via ``%s``)
# into the "about" patterns below: the wiki host, an optional
# ``index.php?title=`` form, then ``Lektury`` followed by ``:`` or ``/``.
# Literal dots are escaped so that ``.`` cannot match an arbitrary character
# in the host name or in ``index.php``.
# The value is pure ASCII, so it combines with the unicode pattern templates
# without any encoding concerns.
_BASE = r"""http://wiki\.wolnepodreczniki\.pl/(?:index\.php\?title=)?Lektury(?::|/)"""
12 ur"""%s(?P<title>[^/]+)/?$""" % _BASE,
13 ur"""%s(?P<author>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
14 ur"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
15 ur"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<part>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
def compile_patterns(patterns):
    """Lazily compile each regex source in ``patterns``.

    Yields one compiled pattern object per input item, in input order,
    each compiled with the ``re.UNICODE`` flag. Nothing is compiled until
    the returned generator is iterated.
    """
    for source in patterns:
        compiled = re.compile(source, re.UNICODE)
        yield compiled
22 def match_first(text, patterns):
23 for pattern in patterns:
24 m = pattern.match(text)
33 self.documents = set()
35 self.unrecognized = {}
37 self.about_patterns = list(compile_patterns(ABOUT_PATTERNS))
39 assert match_first("""http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:Mickiewicz/%C5%9Amier%C4%87_Pu%C5%82kownika/""", self.about_patterns)
40 assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Anonim/Ala""", self.about_patterns)
41 assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Karpi%C5%84ski/Sielanki/Powr%C3%B3t_z_Warszawy_na_wie%C5%9B""", self.about_patterns)
def read_file(self, path):
    """Load the document stored at ``path``.

    Delegates to ``WLDocument.from_file`` and returns its result unchanged;
    any exception raised by that call propagates to the caller.
    """
    document = WLDocument.from_file(path)
    return document
47 for file in os.listdir(u"."):
49 doc = self.read_file(file)
50 about_link = unicode(doc.book_info.about)
51 url = doc.book_info.url
54 self.invalid.add(file)
56 self.unrecognized[file] = url
59 m = match_first(about_link, self.about_patterns)
61 if m in self.documents:
62 l = self.duplicates.get(m, [])
64 self.duplicates[m] = l
68 self.unrecognized[file] = about_link
70 self.invalid.add(file)
75 {0} correct documents,
80 \t{duplicates}""".format(
83 len(self.unrecognized),
85 duplicates='\n\t'.join(repr(x) for x in self.duplicates.items()),
86 unrecognized='\n\t'.join(repr(x) for x in self.unrecognized.items())
89 for doc in self.documents:
90 print u"http://redakcja.wolnelektury.pl/documents/{0}".format('/'.join(doc).lower())
93 if __name__ == '__main__':