lesmianator.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Copyright © 2010,2011 Fundacja Nowoczesna Polska
   4 #
   5 # This file is part of Leśmianator.
   6 #
   7 # Leśmianator is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU Affero General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Leśmianator is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU Affero General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Affero General Public License
  18 # along with Leśmianator.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20
  21 """
  22 Leśmianator - program generujący wiersze na życzenie.
  23
  24 Wiersz generowany jest według następującego algorytmu:
  25 Każdy kolejny znak jest losowany zgodnie z wyznaczoną wcześniej częstością
  26 występowania znaków w tekstach źródłowych w kontekście trzech poprzednich
  27 znaków.
  28
  29 Przykładowo, jeśli dotąd wygenerowaliśmy ciąg "Litw", to bierzemy pod uwagę
  30 ciąg "itw". Jeśli w plikach źródłowych np. dwa razy pojawił się ciąg "itwo"
  31 i raz ciąg "itwa", to kolejny znak losujemy między 'o' (z prawdopodobieństwem
  32 2/3) a 'a' (z prawdopodobieństwem 1/3).
  33
  34 Wszystkie litery w plikach źródłowych są najpierw zamieniane na małe.
  35 Białe znaki traktowane są tak samo, jak wszystkie pozostałe.
  36
  37 Leśmianator kończy pracę wraz z ukończeniem strofy (tj. wstawieniem pustej
  38 linii), o ile napisał co najmniej dwa niepuste wersy (w przeciwnym przypadku
  39 zaczyna kolejną strofę).
  40
  41 """
  42
  43 from collections import Counter, defaultdict
  44 from os.path import abspath, dirname, join
  45 import cPickle as pickle
  46 from random import randint
  47 import re
  48
  49
  50 class Lesmianator(object):
  51
  52     SAMPLE_LENGTH = 3
  53     MIN_LINES = 2
  54     MAX_LEN = 1000
  55
  56     DATA_FILE = join(dirname(abspath(__file__)), 'data.p')
  57
  58
  59     def __init__(self):
  60         self.continuations = defaultdict(Counter)
  61
  62     def load(self):
  63         """Ładuje wyniki analizy z pliku."""
  64         with open(self.DATA_FILE) as f:
  65             self.continuations = pickle.load(f)
  66
  67     def save(self):
  68         """Zapisuje wyniki analizy do pliku."""
  69         with open(self.DATA_FILE, 'w') as f:
  70             pickle.dump(self.continuations, f)
  71
  72     def add_text(self, text):
  73         """Wykonuje właściwą analizę tekstu źródłowego.
  74
  75         Zamienia tekst na małe litery i dla każdego znaku zapisuje, po jakim
  76         ciągu trzech znaków wystąpił.
  77
  78         """
  79         last_word = ''
  80         text = unicode(text, 'utf-8').lower()
  81         for letter in text:
  82             self.continuations[last_word][letter] += 1
  83             last_word = last_word[-self.SAMPLE_LENGTH + 1:] + letter
  84
  85     re_txt_file = re.compile(r'\n{3,}(.*?)\n*-----\n', re.S).search
  86     def add_txt_file(self, txt_file):
  87         """Dodaje plik tekstowy do analizy.
  88
  89         Pliki tekstowe z Wolnych Lektur zawierają na początku nazwisko
  90         autora, tytuł i podtytuł utworu, następnie kilka pustych linii,
  91         tekst utworu i oddzieloną kreskami stopkę.  Funkcja wyciana z tego
  92         sam goły tekst i przekazuje go do właściwej analizy statystycznej.
  93
  94         """
  95         m = self.re_txt_file(txt_file.read())
  96         self.add_text(m.group(1))
  97
  98     def choose_letter(self, word):
  99         """Losuje kolejny znak wiersza."""
 100         if word not in self.continuations:
 101             return u'\n'
 102
 103         choices = sum((self.continuations[word][letter]
 104                        for letter in self.continuations[word]))
 105         r = randint(0, choices - 1)
 106
 107         for letter in self.continuations[word]:
 108             r -= self.continuations[word][letter]
 109             if r < 0:
 110                 return letter
 111
 112     def __call__(self):
 113         """Zwraca wygenerowany wiersz."""
 114         letters = []
 115         word = u''
 116
 117         finished_stanza_verses = 0
 118         current_stanza_verses = 0
 119         verse_start = True
 120
 121         char_count = 0
 122
 123         # kończy pracę, jeśli ukończone strofy zawierają co najmniej
 124         # dwie niepuste linie
 125         while finished_stanza_verses < self.MIN_LINES and char_count < self.MAX_LEN:
 126             letter = self.choose_letter(word)
 127             letters.append(letter)
 128             word = word[-self.SAMPLE_LENGTH + 1:] + letter
 129             char_count += 1
 130
 131             if letter == u'\n':
 132                 if verse_start:
 133                     finished_stanza_verses += current_stanza_verses
 134                     current_stanza_verses = 0
 135                 else:
 136                     current_stanza_verses += 1
 137                     verse_start = True
 138             else:
 139                 verse_start = False
 140
 141         return ''.join(letters).strip()
 142
 143
 144 if __name__ == '__main__':
 145     poet = Lesmianator()
 146     poet.load()
 147     print poet()