Added DCMeta - EAV based application to represent document meta-data. Started to...
[redakcja.git] / scripts / fix_links.py
1 #!/usr/bin/env python
2 import argparse
3 import os
4 import sys
5 import re
6
7 from librarian.parser import WLDocument
8
# Common prefix of every recognised "about" link: the wiki host, an optional
# "index.php?title=" form, and the "Lektury" namespace followed by ":" or "/".
# The dots are escaped so they only match a literal "." (an unescaped "." would
# accept any character, e.g. "wikiXwolnepodreczniki.pl").
# Plain raw strings are used instead of the Python-2-only ``ur`` prefix; the
# patterns are pure ASCII, so they work against both byte and unicode text.
_BASE = r"""http://wiki\.wolnepodreczniki\.pl/(?:index\.php\?title=)?Lektury(?::|/)"""

# Accepted link shapes, from one to four path segments after the prefix.
# Each segment is a run of non-"/" characters; a single trailing "/" is allowed.
ABOUT_PATTERNS = (
    r"""%s(?P<title>[^/]+)/?$""" % _BASE,
    r"""%s(?P<author>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
    r"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
    r"""%s(?P<author>[^/]+)/(?P<collection>[^/]+)/(?P<part>[^/]+)/(?P<title>[^/]+)/?$""" % _BASE,
)
17
def compile_patterns(patterns):
    """Lazily yield a compiled regex (re.UNICODE flag set) for each pattern source.

    `patterns` is any iterable of regular-expression strings; nothing is
    compiled until the returned generator is consumed.
    """
    flags = re.UNICODE
    for source in patterns:
        compiled = re.compile(source, flags)
        yield compiled
21
def match_first(text, patterns):
    """Return the groups of the first pattern that matches `text`, else False.

    Patterns are tried in order with `.match()` (anchored at the start of
    `text`); trying stops at the first hit. The result is that match's
    `groups()` tuple, or False when no pattern matches.
    """
    attempts = (regex.match(text) for regex in patterns)
    hit = next((found for found in attempts if found is not None), None)
    if hit is None:
        return False
    return hit.groups()
28
29
class Task(object):
    """Scan the current directory and classify documents by their "about" link.

    After run(), the results are available on the instance attributes
    `documents`, `invalid`, `unrecognized` and `duplicates`, and a summary
    has been printed to standard output.
    """

    # Summary printed by run(); positional fields are counts, keyword fields
    # are the pre-joined detail lines.
    _REPORT_TEMPLATE = (
        u"{0} correct documents, \n"
        u"{1} invalid,\n"
        u"{2} unrecognized,\n"
        u"\t{unrecognized}\n"
        u"{3} duplicate names\n"
        u"\t{duplicates}"
    )

    def __init__(self):
        # Group tuples (as returned by match_first) of every recognised link.
        self.documents = set()
        # Names of files that could not be processed.
        self.invalid = set()
        # File name -> link that matched no pattern (or the document url when
        # the about link is empty).
        self.unrecognized = {}
        # Group tuple -> names of the later files that produced the same tuple
        # (the first file with that tuple is not listed).
        self.duplicates = {}
        self.about_patterns = list(compile_patterns(ABOUT_PATTERNS))

        # Start-up sanity check that the patterns accept known link shapes.
        assert match_first("""http://wiki.wolnepodreczniki.pl/index.php?title=Lektury:Mickiewicz/%C5%9Amier%C4%87_Pu%C5%82kownika/""", self.about_patterns)
        assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Anonim/Ala""", self.about_patterns)
        assert match_first("""http://wiki.wolnepodreczniki.pl/Lektury:Karpi%C5%84ski/Sielanki/Powr%C3%B3t_z_Warszawy_na_wie%C5%9B""", self.about_patterns)

    def read_file(self, path):
        """Load the document at `path` via WLDocument.from_file."""
        return WLDocument.from_file(path)

    def run(self):
        """Classify every entry of the current directory, then print the report."""
        for file_name in os.listdir(u"."):
            try:
                self._classify(file_name)
            except Exception:
                # Deliberate best effort: anything that cannot be read or
                # inspected is counted as invalid instead of aborting the scan.
                self.invalid.add(file_name)

        self._print_report()

    def _classify(self, file_name):
        """Read one file and record it in the matching result collection."""
        try:
            to_text = unicode  # Python 2: same coercion as before
        except NameError:
            to_text = str  # Python 3 has no separate unicode type

        doc = self.read_file(file_name)
        about = doc.book_info.about
        # A missing link must stay empty: coercing None to text would give the
        # truthy string "None" and skip the url fallback below.
        about_link = u"" if about is None else to_text(about)
        url = doc.book_info.url

        if not about_link:
            if not url:
                self.invalid.add(file_name)
            else:
                self.unrecognized[file_name] = url
            return

        groups = match_first(about_link, self.about_patterns)
        if not groups:
            self.unrecognized[file_name] = about_link
        elif groups in self.documents:
            self.duplicates.setdefault(groups, []).append(file_name)
        else:
            self.documents.add(groups)

    def _print_report(self):
        """Print the summary counts, the detail lines and one URL per document."""
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(self._REPORT_TEMPLATE.format(
            len(self.documents),
            len(self.invalid),
            len(self.unrecognized),
            len(self.duplicates),
            duplicates='\n\t'.join(repr(x) for x in self.duplicates.items()),
            unrecognized='\n\t'.join(repr(x) for x in self.unrecognized.items()),
        ))

        for doc in self.documents:
            print(u"http://redakcja.wolnelektury.pl/documents/{0}".format('/'.join(doc).lower()))
91
92
if __name__ == '__main__':
    # Script entry point: build the task and scan the current directory.
    Task().run()