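Fix parse.py: create missing output directories, add missing front-matter metadata (section, listable, extends, default_block), and switch from slughifi to slugify.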

author Radek Czajka <rczajka@rczajka.pl>

Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)

committer Radek Czajka <rczajka@rczajka.pl>

Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)
author Radek Czajka <rczajka@rczajka.pl>
Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)
committer Radek Czajka <rczajka@rczajka.pl>
Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)
diff --git a/parse.py b/parse.py

index e1296c6..dbf483e 100755 (executable)
--- a/parse.py
+++ b/parse.py
@@ -6,7 +6,7 @@ import re
  from urllib import unquote
  from urllib2 import urlopen
  from lxml.html import etree
-from fnpdjango.utils.text.slughifi import slughifi
+from slugify import slugify
  
  
  fin = urlopen('http://ofop.redakcja.wolnelektury.pl/documents/book/kurs-ip-dla-uniwersytetow/html')
@@ -75,31 +75,39 @@ for target in root.findall(".//td"):
  for tag in root:
      if tag.tag == 'h2':
          print etree.tostring(tag)
-        rozdzial = slughifi(tag.text)
-        for f in os.listdir('content/import/%s' % rozdzial):
-            if f.endswith('.html'):
-                    os.unlink('content/import/%s/%s' % (rozdzial, f))
+        rozdzial = slugify(tag.text)
+        dir_rozdzial = 'content/import/%s' % rozdzial
+        if os.path.exists(dir_rozdzial):
+            for f in os.listdir(dir_rozdzial):
+                if f.endswith('.html'):
+                    os.unlink('%s/%s' % (dir_rozdzial, f))
+        else:
+            os.makedirs(dir_rozdzial)
          podrozdzial = None
          podrozdzial_n = 0
      elif tag.tag == 'h3':
          tytul = tag.text
          tytul = re.sub('^[0-9\. ]+', '', tytul).strip()
-        slug = slughifi(tytul.split(':')[0])
+        slug = slugify(tytul.split(':')[0])
          if podrozdzial:
              podrozdzial.close()
          podrozdzial_n += 1
-        podrozdzial = open('content/import/%s/%s.html' % (rozdzial, slug), 'w')
+        podrozdzial = open('%s/%s.html' % (dir_rozdzial, slug), 'w')
          podrozdzial.write((u'''---
  title: "%s"
+section: %s
+listable: true
+extends: text.j2
+default_block: main
  order: %d
  ---
  
-''' % (tytul, podrozdzial_n)).encode('utf-8'))
+''' % (tytul, rozdzial, podrozdzial_n)).encode('utf-8'))
      elif podrozdzial:
          if tag.tag == 'h4':
              tag.text = re.sub('^[0-9\. ]+', '', tag.text).strip()
              tag.tag = 'h2'
-            slug = slughifi(tag.text)
+            slug = slugify(tag.text)
              etree.SubElement(tag, "a", {'class': 'permalink',
                  'id': slug, 'href': '#' + slug,
                  'title': 'Link do tego miejsca'})
@@ -109,4 +117,4 @@ order: %d
          podrozdzial.write(etree.tostring(tag, encoding='utf-8'))
  
  if podrozdzial:
-    podrozdzial.close()
-\ No newline at end of file
+    podrozdzial.close()
author	Radek Czajka <rczajka@rczajka.pl>
	Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)
committer	Radek Czajka <rczajka@rczajka.pl>
	Wed, 21 Sep 2016 08:12:35 +0000 (10:12 +0200)