cleaning around ebooks
[wolnelektury.git] / apps / lesmianator / management / commands / lesmianator.py
index 36d7144..5412bee 100644 (file)
@@ -2,6 +2,7 @@
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
+import re
 import sys
 from cPickle import load, dump
 from optparse import make_option
 import sys
 from cPickle import load, dump
 from optparse import make_option
@@ -12,6 +13,9 @@ from django.conf import settings
 
 from catalogue.models import Book, Tag
 
 
 from catalogue.models import Book, Tag
 
+# extract the body of a text file: what follows the first run of 3+ newlines, up to the first '-----' + newline
+re_text = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search
+
 
 class Command(BaseCommand):
     option_list = BaseCommand.option_list + (
 
 class Command(BaseCommand):
     option_list = BaseCommand.option_list + (
@@ -62,17 +66,22 @@ class Command(BaseCommand):
                     print self.style.NOTICE('%s has no TXT file' % book.slug)
                 skipped += 1
                 continue
                     print self.style.NOTICE('%s has no TXT file' % book.slug)
                 skipped += 1
                 continue
+            f = open(book.txt_file.path)
+            m = re_text(f.read())
+            if not m:
+                print self.style.ERROR("Unknown text format: %s" % book.slug)
+                skipped += 1
+                continue
+
             processed += 1
             last_word = ''
             processed += 1
             last_word = ''
-            for number, line in enumerate(book.txt_file):
-                if number < 17:
-                    continue
-                line = unicode(line, 'utf-8').lower()
-                for letter in line:
-                    mydict = lesmianator.setdefault(last_word, {})
-                    myval = mydict.setdefault(letter, 0)
-                    mydict[letter] += 1
-                    last_word = last_word[-2:] + letter
+            text = unicode(m.group(1), 'utf-8').lower()
+            for letter in text:
+                mydict = lesmianator.setdefault(last_word, {})
+                myval = mydict.setdefault(letter, 0)
+                mydict[letter] += 1
+                last_word = last_word[-2:] + letter
+            f.close()
 
         if not processed:
             if skipped:
 
         if not processed:
             if skipped:
@@ -84,10 +93,10 @@ class Command(BaseCommand):
         try:
             dump(lesmianator, open(path, 'w'))
         except:
         try:
             dump(lesmianator, open(path, 'w'))
         except:
-            print self.style.ERROR("Counldn't write to $s" % path)
+            print self.style.ERROR("Couldn't write to $s" % path)
             return
 
         dump(lesmianator, open(path, 'w'))
         if verbose >= 1:
             print "%d processed, %d skipped" % (processed, skipped)
             return
 
         dump(lesmianator, open(path, 'w'))
         if verbose >= 1:
             print "%d processed, %d skipped" % (processed, skipped)
-            print "Results dumped do %s" % path 
+            print "Results dumped to %s" % path