7 from librarian.parser import WLDocument
 
   # Shared regex prefix for the about-URL patterns that follow (each is built with `% _BASE`).
   # Matches "http://wiki.wolnepodreczniki.pl/", then an optional "index.php?title=",
   # then "Lektury" followed by either ":" or "/".
   # NOTE(review): the dots in the host name and in "index.php" are not escaped, so each one
   # matches any character, not only a literal "." -- confirm whether that is intended.
   9 _BASE = ur"""http://wiki.wolnepodreczniki.pl/(?:index.php\?title=)?Lektury(?::|/)"""
 
  12     ur"""%s(?P<title>[^/]+)/?$""" % _BASE,
 
  13     ur"""%s(?P<author>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
 
  14     ur"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
 
  15     ur"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<part>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
 
  18 def compile_patterns(patterns):
 
  20         yield re.compile(p, re.UNICODE)
 
  22 def match_first(text, patterns):
 
  23     for pattern in patterns:
 
  24         m = pattern.match(text)
 
  33         self.documents = set()
 
  35         self.unrecognized = {}
 
  37         self.about_patterns = list(compile_patterns(ABOUT_PATTERNS))
 
  39         assert match_first("""http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:Mickiewicz/%C5%9Amier%C4%87_Pu%C5%82kownika/""", self.about_patterns)
 
  40         assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Anonim/Ala""", self.about_patterns)
 
  41         assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Karpi%C5%84ski/Sielanki/Powr%C3%B3t_z_Warszawy_na_wie%C5%9B""", self.about_patterns)
 
  # Load a single document: delegates to WLDocument.from_file(path) and returns
  # whatever that call returns, unchanged. `self` is not used by the visible body.
  # NOTE(review): any exception raised by WLDocument.from_file propagates to the caller
  # from here; nothing is caught in this method.
  43     def read_file(self, path):
 
  44         return WLDocument.from_file(path)
 
  47         for file in os.listdir(u"."):
 
  49                 doc = self.read_file(file)
 
  50                 about_link = unicode(doc.book_info.about)
 
  51                 url = doc.book_info.url
 
  54                         self.invalid.add(file)
 
  56                     self.unrecognized[file] = url
 
  59                 m = match_first(about_link, self.about_patterns)
 
  61                     if m in self.documents:
 
  62                         l = self.duplicates.get(m, [])
 
  64                         self.duplicates[m] = l
 
  68                     self.unrecognized[file] = about_link
 
  70                 self.invalid.add(file)
 
  75 {0} correct documents, 
 
  80 \t{duplicates}""".format(
 
  83             len(self.unrecognized),
 
  85             duplicates='\n\t'.join(repr(x) for x in self.duplicates.items()),
 
  86             unrecognized='\n\t'.join(repr(x) for x in self.unrecognized.items())
 
  89         for doc in self.documents:
 
  90             print u"http://redakcja.wolnelektury.pl/documents/{0}".format('/'.join(doc).lower())
 
  93 if __name__ == '__main__':