src/sources/ocr.py

   1 from lxml import etree
   2
   3
   4 def add_page_to_master(master, ocr_filename):
   5     """ Simplest implementation: just dump text to an akap. """
   6     with open(ocr_filename) as f:
   7         txt = f.read()
   8
   9     txt = txt.strip()
  10
  11     if len(master):
  12         master[-1].tail = (master[-1].tail or '') + '\n\n' + txt + '\n\n'
  13     else:
  14         master.text = (master.text or '') + '\n\n' + txt + '\n\n'
  15
  16
  17 def add_page_to_master_as_stanzas(master, ocr_filename):
  18     """ Simplest implementation: just dump text to an akap. """
  19     with open(ocr_filename) as f:
  20         txt = f.read()
  21
  22     strofa = etree.SubElement(master, 'strofa')
  23     strofa.text="\n"
  24     for piece in txt.split('\n'):
  25         if not piece.strip(): continue
  26         strofa.text += piece + '/\n'
  27
  28
  29 def add_page_to_master_as_p(master, ocr_filename):
  30     """ Simplest implementation: just dump text to an akap. """
  31     with open(ocr_filename) as f:
  32         txt = f.read()
  33
  34     for piece in txt.strip().split('\n\n'):
  35         if not piece.strip(): continue
  36         p = etree.SubElement(master, 'akap')
  37         p.text = piece