--- /dev/null
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import glob
+import datetime
+import tarfile
+import re
+
+try:
+ sys.argv.remove('-verbose')
+ VERBOSE = True
+except ValueError:
+ VERBOSE = False
+
+try:
+ sys.argv.remove('-docPerParagraph')
+ docPerParagraph = True
+except ValueError:
+ docPerParagraph = False
+
+reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
+reTagOnly = re.compile('^<.*?>$')
+reNumberOnly = re.compile(r'^\d+\.?$')
+
+docCount = 0
+didEnglish = False
+
+def write(date, title, pending, fOut):
+ global docCount
+ body = ' '.join(pending).replace('\t', ' ').strip()
+ if len(body) > 0:
+ line = '%s\t%s\t%s\n' % (title, date, body)
+ fOut.write(line)
+ docCount += 1
+ del pending[:]
+ if VERBOSE:
+ print len(body)
+
+def processTar(fileName, fOut):
+
+ global didEnglish
+
+ t = tarfile.open(fileName, 'r:gz')
+ for ti in t:
+ if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
+
+ tup = ti.name.split('/')
+ lang = tup[1]
+ year = int(tup[2][3:5])
+ if year < 20:
+ year += 2000
+ else:
+ year += 1900
+
+ month = int(tup[2][6:8])
+ day = int(tup[2][9:11])
+ date = datetime.date(year=year, month=month, day=day)
+
+ if VERBOSE:
+ print
+ print '%s: %s' % (ti.name, date)
+ nextIsTitle = False
+ title = None
+ pending = []
+ for line in t.extractfile(ti).readlines():
+ line = line.strip()
+ if reChapterOnly.match(line) is not None:
+ if title is not None:
+ write(date, title, pending, fOut)
+ nextIsTitle = True
+ continue
+ if nextIsTitle:
+ if not reNumberOnly.match(line) and not reTagOnly.match(line):
+ title = line
+ nextIsTitle = False
+ if VERBOSE:
+ print ' title %s' % line
+ continue
+ if line.lower() == '<p>':
+ if docPerParagraph:
+ write(date, title, pending, fOut)
+ else:
+ pending.append('PARSEP')
+ elif not reTagOnly.match(line):
+ pending.append(line)
+ if title is not None and len(pending) > 0:
+ write(date, title, pending, fOut)
+
+ didEnglish = True
+
+# '/x/lucene/data/europarl/all.lines.txt'
+dirIn = sys.argv[1]
+fileOut = sys.argv[2]
+
+fOut = open(fileOut, 'wb')
+
+for fileName in glob.glob('%s/??-??.tgz' % dirIn):
+ if fileName.endswith('.tgz'):
+ print 'process %s; %d docs so far...' % (fileName, docCount)
+ processTar(fileName, fOut)
+
+print 'TOTAL: %s' % docCount
+
+# To produce the line files, run something like this:
+"""
+
+# Europarl V5 makes 76,917 docs, avg 38.6 KB per
+python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# Run again, this time each paragraph is a doc:
+# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
+python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
+shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
+rm /x/lucene/data/europarl/tmp.lines.txt
+
+# ~5.5 MB gzip'd:
+head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
+head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
+shuf tmp.txt > europarl.subset.txt
+rm -f tmp.txt
+gzip --best europarl.subset.txt
+"""