2 from __future__ import unicode_literals
6 from urllib import unquote
7 from urllib2 import urlopen
8 from lxml.html import etree
9 from slugify import slugify
# Download the rendered HTML of the book from the editorial server and parse
# it with lxml's HTML parser.
# NOTE(review): the leading number on each line looks like residue of the
# original line numbering, and that numbering has gaps -- confirm against the
# original script before relying on this listing.
12 fin = urlopen('http://ofop.redakcja.wolnelektury.pl/documents/book/kurs-ip-dla-uniwersytetow/html')
13 parser = etree.HTMLParser()
14 tree = etree.parse(fin, parser)
# The rest of the script works on the first <div id="book-text">; the [0]
# raises IndexError when the page contains no such element.
16 root = tree.xpath("//div[@id='book-text']")[0]
# Fragment of a helper operating on an element `tag`; its `def` line and the
# conditions selecting between the two assignments below are not visible here.
# The visible statements append `tag.tail` (the text that follows the element)
# either to the previous sibling's tail or to the parent's text -- presumably
# so that the trailing text survives when `tag` itself is dropped; TODO confirm.
22 parent = tag.getparent()
24 prev = tag.getprevious()
# Append to the previous sibling's tail; `or ''` covers a tail of None.
26 prev.tail = (prev.tail or '') + tag.tail
# Append to the parent's leading text instead (presumably the branch used when
# there is no previous sibling -- verify against the full source).
28 parent.text = (parent.text or '') + tag.tail
# Return statement of a function taking `text` -- presumably `spojniki`, which
# is called below; its `def` line is not visible here.
# The pattern matches a single word character that is preceded by whitespace
# or the start of the string and followed by whitespace; the replacement keeps
# the prefix and the character and substitutes \u00a0 (NO-BREAK SPACE) for the
# whitespace run, so a one-letter word stays attached to the word after it.
# NOTE(review): `re` is used here but its import is not visible in this view.
33 return re.sub(r'(\s|^)(\w)\s+', r'\1\2\u00a0', text)
# Apply `spojniki` to the text and tail of every element yielded by
# root.iter().
# NOTE(review): `text` / `tail` can be None in lxml; the guards are presumably
# on lines missing from this view -- verify.
35 for tag in root.iter():
37 tag.text = spojniki(tag.text)
39 tag.tail = spojniki(tag.tail)
# Rewrite the `src` attribute of every <img> found under `root`.
41 for ilustr in root.findall('.//img'):
# A source that does not already end in .png or .jpg has everything after its
# last dot replaced by 'jpg' (or '.jpg' appended when it contains no dot).
42 if not ilustr.attrib['src'].endswith(('.png', '.jpg')):
43 ilustr.attrib['src'] = ilustr.attrib['src'].rsplit('.', 1)[0] + '.jpg'
# Wrap the URL-unquoted source in a "{{ media_url('/ilustr/...') }}" template
# expression.
# NOTE(review): indentation is lost in this view, so it cannot be told from
# here whether this runs for every image or only inside the `if` above.
44 ilustr.attrib['src'] = "{{ media_url('/ilustr/" + unquote(ilustr.attrib['src']) + "') }}"
# Three passes over <a> elements under `root`: those whose class attribute is
# exactly "target", those whose class is exactly "anchor", and those whose
# class starts with "sec" (a missing class is treated as '').
# NOTE(review): the loop bodies are not visible in this view; presumably these
# helper links are stripped from the output -- TODO confirm.
45 for target in root.findall(".//a[@class='target']"):
47 for target in root.findall(".//a[@class='anchor']"):
49 for target in root.findall(".//a"):
50 if target.attrib.get('class', '').startswith('sec'):
# Structural diagnostics: each loop below prints (Python 2 `print` statement)
# the serialized form of any element whose nesting is unexpected. Nothing is
# modified in the statements visible here.

# <punkt> elements whose parent is neither <ol> nor <ul>.
53 for target in root.findall(".//punkt"):
54 if target.getparent().tag not in ('ol', 'ul'):
55 print etree.tostring(target)
# Direct children of <ol> that are not <li>.
56 for target in root.findall(".//ol/*"):
57 if target.tag != 'li':
58 print etree.tostring(target)
# Direct children of <ul> that are not <li>.
59 for target in root.findall(".//ul/*"):
60 if target.tag != 'li':
61 print etree.tostring(target)
# Direct children of <table> that are not <tr>.
62 for target in root.findall(".//table/*"):
63 if target.tag != 'tr':
64 print etree.tostring(target)
# <tr> elements whose parent is not <table>.
65 for target in root.findall(".//tr"):
66 if target.getparent().tag != 'table':
67 print etree.tostring(target)
# Direct children of <tr> that are not <td>.
68 for target in root.findall(".//tr/*"):
69 if target.tag != 'td':
70 print etree.tostring(target)
# <td> elements whose parent is not <tr>.
71 for target in root.findall(".//td"):
72 if target.getparent().tag != 'tr':
73 print etree.tostring(target)
# Export section: writes the processed elements into per-chapter directories,
# one HTML file per sub-chapter.
# NOTE(review): the enclosing loop over `tag` and the conditions that select
# each branch are not visible in this view; the comments below describe only
# the statements that are shown.
# NOTE(review): `os` and `re` are used here but their imports are not visible.

# Chapter heading: echo it, then derive the chapter slug and its output
# directory from the heading text.
77 print etree.tostring(tag)
78 rozdzial = slugify(tag.text)
79 dir_rozdzial = 'content/import/%s' % rozdzial
# If the directory already exists, delete the .html files it contains.
80 if os.path.exists(dir_rozdzial):
81 for f in os.listdir(dir_rozdzial):
82 if f.endswith('.html'):
83 os.unlink('%s/%s' % (dir_rozdzial, f))
# Create the directory (presumably the `else` branch of the existence check
# above -- verify; os.makedirs raises if the directory already exists).
85 os.makedirs(dir_rozdzial)
# Sub-chapter title: strip the leading run of digits, dots and spaces
# (the numbering), then slugify the part before the first ':'.
# NOTE(review): where `tytul` is first assigned is not visible here.
90 tytul = re.sub('^[0-9\. ]+', '', tytul).strip()
91 slug = slugify(tytul.split(':')[0])
# Open the sub-chapter file for writing and emit a header that starts with
# '---' and is filled with the title, the chapter slug and `podrozdzial_n`;
# the header is encoded to UTF-8 before writing.
# NOTE(review): the middle of the header template and the point where this
# file is closed are not visible in this view.
95 podrozdzial = open('%s/%s.html' % (dir_rozdzial, slug), 'w')
96 podrozdzial.write((u'''---
105 ''' % (tytul, rozdzial, podrozdzial_n)).encode('utf-8'))
# Strip the leading numbering from this heading's text.
108 tag.text = re.sub('^[0-9\. ]+', '', tag.text).strip()
# Append an empty <a class="permalink"> child whose id and '#'-href are the
# slug of the heading text; the title is Polish for "Link to this place".
110 slug = slugify(tag.text)
111 etree.SubElement(tag, "a", {'class': 'permalink',
112 'id': slug, 'href': '#' + slug,
113 'title': 'Link do tego miejsca'})
# <h5> headings only get their numbering stripped.
114 elif tag.tag == 'h5':
115 tag.text = re.sub('^[0-9\. ]+', '', tag.text).strip()
# Serialize the element as UTF-8 and write it to the sub-chapter file
# currently bound to `podrozdzial`.
117 podrozdzial.write(etree.tostring(tag, encoding='utf-8'))