2 from __future__ import unicode_literals
6 from urllib import unquote
7 from urllib2 import urlopen
8 from lxml.html import etree
9 from fnpdjango.utils.text.slughifi import slughifi
# Download the rendered HTML of the book and parse it with lxml's HTML parser.
# urllib2 response objects are not context managers on Python 2, so the
# handle is released with try/finally instead of staying open for the whole
# run of the script.
fin = urlopen('http://ofop.redakcja.wolnelektury.pl/documents/book/kurs-ip-dla-uniwersytetow/html')
parser = etree.HTMLParser()
try:
    tree = etree.parse(fin, parser)
finally:
    fin.close()

# All later processing works on the <div id="book-text"> element.  The [0]
# raises IndexError when the downloaded page contains no such element.
root = tree.xpath("//div[@id='book-text']")[0]
# Fragment of a helper that re-attaches the text following `tag` (lxml keeps
# it in `tag.tail`) to a neighbouring node.
# NOTE(review): the enclosing `def`, the branch conditions and any call that
# actually removes `tag` are not visible here -- presumably this runs just
# before the element is dropped from its parent; confirm against the full file.
22 parent = tag.getparent()
24 prev = tag.getprevious()
# The tail is appended to the tail of the preceding sibling ...
26 prev.tail = (prev.tail or '') + tag.tail
# ... or to the parent's own leading text (presumably the branch taken when
# there is no preceding sibling -- verify).
28 parent.text = (parent.text or '') + tag.tail
# Returns `text` with the whitespace run after every single word character
# (one that starts the string or follows whitespace) replaced by one U+00A0
# no-break space, so that the one-letter word stays glued to the next word.
# NOTE(review): the replacement is a raw literal; with unicode_literals on
# Python 2 the \u00a0 escape is presumably still expanded at compile time,
# which is what makes this work -- confirm before porting to Python 3.
# NOTE(review): the `def` line is not visible here; callers use the name
# spojniki().
33 return re.sub(r'(\s|^)(\w)\s+', r'\1\2\u00a0', text)
# Run spojniki() over both text slots (leading text and tail) of every
# element in the tree rooted at `root`, including `root` itself.
# NOTE(review): guards skipping empty/None text and tail presumably sit
# between these statements but are not visible here -- confirm.
35 for tag in root.iter():
37 tag.text = spojniki(tag.text)
39 tag.tail = spojniki(tag.tail)
# Rewrite the source of every <img>:
#   1. a source that does not already end in .png or .jpg gets everything
#      after its last dot replaced by "jpg";
#   2. the URL-unquoted source is then wrapped in a media_url() template
#      expression rooted at /ilustr/.
# The value is deliberately read back from the attribute mapping before
# unquoting, exactly as it is stored on the element.
for ilustr in root.findall('.//img'):
    attrs = ilustr.attrib
    if not attrs['src'].endswith(('.png', '.jpg')):
        stem = attrs['src'].rsplit('.', 1)[0]
        attrs['src'] = stem + '.jpg'
    path = unquote(attrs['src'])
    attrs['src'] = "{{ media_url('/ilustr/" + path + "') }}"
# Visit every <a> whose class attribute is exactly "target".
# NOTE(review): the loop bodies in this section are not visible here --
# confirm what is done with each matched element.
45 for target in root.findall(".//a[@class='target']"):
# Same traversal for <a> elements whose class attribute is exactly "anchor".
47 for target in root.findall(".//a[@class='anchor']"):
# Finally every <a>; only those whose class starts with "sec" enter the
# branch (a missing class attribute is treated as the empty string).
49 for target in root.findall(".//a"):
50 if target.attrib.get('class', '').startswith('sec'):
# Structural sanity checks: print the markup of every element that breaks
# one of the expected nesting rules.  Nothing is modified here; the output
# is purely diagnostic.  The rules run in the order listed.
_nesting_checks = (
    # (ElementPath query, inspect the parent instead of the node?, allowed tags)
    (".//punkt", True, ('ol', 'ul')),
    (".//ol/*", False, ('li',)),
    (".//ul/*", False, ('li',)),
    (".//table/*", False, ('tr',)),
    (".//tr", True, ('table',)),
    (".//tr/*", False, ('td',)),
    (".//td", True, ('tr',)),
)
for query, check_parent, allowed in _nesting_checks:
    for target in root.findall(query):
        subject = target.getparent() if check_parent else target
        if subject.tag not in allowed:
            # Single parenthesised argument: prints the same text as the
            # bare Python 2 print statement.
            print(etree.tostring(target))
# NOTE(review): this looks like the body of a loop over heading/content
# elements; the loop header, the branch conditions and several statements
# are not visible here, so the grouping below is an assumption to confirm.

# Echo the element, derive the directory slug `rozdzial` from its text and
# delete every *.html file already present in content/import/<rozdzial>.
77 print etree.tostring(tag)
78 rozdzial = slughifi(tag.text)
79 for f in os.listdir('content/import/%s' % rozdzial):
80 if f.endswith('.html'):
81 os.unlink('content/import/%s/%s' % (rozdzial, f))
# Strip the leading run of digits, dots and spaces from the title, then
# slugify the part of the title before the first colon.
86 tytul = re.sub('^[0-9\. ]+', '', tytul).strip()
87 slug = slughifi(tytul.split(':')[0])
# Open content/import/<rozdzial>/<slug>.html for writing and emit a header
# formatted from (tytul, podrozdzial_n), encoded as UTF-8.
# NOTE(review): the middle of the header template is not visible; the
# leading '---' suggests a front-matter block -- confirm.
91 podrozdzial = open('content/import/%s/%s.html' % (rozdzial, slug), 'w')
92 podrozdzial.write((u'''---
97 ''' % (tytul, podrozdzial_n)).encode('utf-8'))
# Drop the numbering prefix from the heading text.
100 tag.text = re.sub('^[0-9\. ]+', '', tag.text).strip()
# Append an <a class="permalink"> child whose id and href are derived from
# the slug of the heading text.
102 slug = slughifi(tag.text)
103 etree.SubElement(tag, "a", {'class': 'permalink',
104 'id': slug, 'href': '#' + slug,
105 'title': 'Link do tego miejsca'})
106 elif tag.tag == 'h5':
107 tag.text = re.sub('^[0-9\. ]+', '', tag.text).strip()
# Serialize the element as UTF-8 and append it to the currently open
# `podrozdzial` file.
109 podrozdzial.write(etree.tostring(tag, encoding='utf-8'))