+++ /dev/null
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import glob
-import datetime
-import tarfile
-import re
-
def _popFlag(flag):
    """Strip the first occurrence of *flag* from ``sys.argv``.

    Returns True when the flag was present (and has been removed),
    False when ``sys.argv`` did not contain it.
    """
    try:
        sys.argv.remove(flag)
    except ValueError:
        return False
    return True


# Optional switches.  They are removed from sys.argv here so that the
# positional arguments can later be read by fixed index.
VERBOSE = _popFlag('-verbose')
docPerParagraph = _popFlag('-docPerParagraph')

# Classifiers applied to whole (already stripped) input lines.
reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')   # chapter marker line
reTagOnly = re.compile('^<.*?>$')                  # line starting with '<' and ending with '>'
reNumberOnly = re.compile(r'^\d+\.?$')             # digits with an optional trailing dot

# Number of documents written so far.
docCount = 0
# Becomes True once the first archive has been fully processed.
didEnglish = False
-
def write(date, title, pending, fOut):
    """Flush the text fragments in *pending* to *fOut* as one document.

    The fragments are joined with single spaces, any tab inside the text
    is replaced by a space (tab is the column separator of the output
    line) and the result is stripped.  If the resulting body is
    non-empty, one ``title<TAB>date<TAB>body`` line is written, the
    module-level ``docCount`` is incremented and *pending* is emptied in
    place.  If the body is empty nothing is written.

    Args:
        date: Value rendered with ``%s`` into the second column.
        title: Value rendered with ``%s`` into the first column.
        pending: Mutable list of text fragments.
        fOut: Object whose ``write`` method receives the finished line.
    """
    global docCount
    body = ' '.join(pending).replace('\t', ' ').strip()
    if body:
        fOut.write('%s\t%s\t%s\n' % (title, date, body))
        docCount += 1
        # Empty the list in place so the caller's reference starts the
        # next document with a clean buffer.
        del pending[:]
        if VERBOSE:
            # Single-argument call form: valid on Python 2 and Python 3.
            print(len(body))
-
def processTar(fileName, fOut):
    """Convert every regular file in one gzipped tar archive to documents.

    Member paths are split on ``/``; the third component is taken as the
    file name and its characters ``[3:5]``, ``[6:8]`` and ``[9:11]`` are
    parsed as two-digit year, month and day (e.g. ``ep-96-04-15...``).
    Two-digit years below 20 map to 20xx, all others to 19xx.

    Within a member, a ``<CHAPTER ID=...>`` line closes the current
    document and announces a title: the next line that is neither
    number-only nor tag-only becomes the title.  A ``<p>`` line
    (case-insensitive) is a paragraph boundary: with ``docPerParagraph``
    the text gathered so far is written out as its own document,
    otherwise a ``PARSEP`` marker is appended.  Any other tag-only line
    is dropped; everything else is collected as body text.

    After the archive has been processed the module-level ``didEnglish``
    flag is set, so that members whose path contains ``/en/`` are skipped
    in every later archive.

    Args:
        fileName: Path of the ``.tgz`` archive to read.
        fOut: Output object handed through to ``write``.
    """
    global didEnglish

    # The with-block closes the archive even if a member fails to parse.
    with tarfile.open(fileName, 'r:gz') as t:
        for ti in t:
            if not ti.isfile():
                continue
            # Presumably every archive carries the same English text, so it
            # is only taken from the first archive processed.
            if didEnglish and '/en/' in ti.name:
                continue

            # Path component 1 is the language directory (not needed here);
            # component 2 is the dated file name.
            baseName = ti.name.split('/')[2]
            year = int(baseName[3:5])
            if year < 20:
                year += 2000
            else:
                year += 1900
            month = int(baseName[6:8])
            day = int(baseName[9:11])
            date = datetime.date(year=year, month=month, day=day)

            if VERBOSE:
                # Single-argument call form: valid on Python 2 and Python 3.
                print('')
                print('%s: %s' % (ti.name, date))

            nextIsTitle = False
            title = None
            pending = []
            # NOTE(review): on Python 3 these lines are bytes while the
            # module's regexes are str patterns; decoding is not handled
            # here -- confirm the target interpreter before porting.
            for line in t.extractfile(ti):
                line = line.strip()

                if reChapterOnly.match(line) is not None:
                    # A new chapter ends the document in progress.
                    if title is not None:
                        write(date, title, pending, fOut)
                    nextIsTitle = True
                    continue

                if nextIsTitle:
                    # Skip number-only and tag-only lines until the real
                    # title text shows up.
                    if not reNumberOnly.match(line) and not reTagOnly.match(line):
                        title = line
                        nextIsTitle = False
                        if VERBOSE:
                            print(' title %s' % line)
                    continue

                if line.lower() == '<p>':
                    if docPerParagraph:
                        write(date, title, pending, fOut)
                    else:
                        pending.append('PARSEP')
                elif not reTagOnly.match(line):
                    pending.append(line)

            # Flush whatever is left of the last chapter in this member.
            if title is not None and len(pending) > 0:
                write(date, title, pending, fOut)

    didEnglish = True
-
# Positional arguments: <input directory> <output line file>, e.g. an
# output path such as '/x/lucene/data/europarl/all.lines.txt'.  The
# optional -verbose / -docPerParagraph switches have already been
# stripped from sys.argv by the time these indexes are read.
dirIn = sys.argv[1]
fileOut = sys.argv[2]

# The with-block guarantees the output file is flushed and closed even
# when one of the archives fails to parse.
with open(fileOut, 'wb') as fOut:
    # Archives are expected to be named like 'xx-yy.tgz'.
    for fileName in glob.glob('%s/??-??.tgz' % dirIn):
        if fileName.endswith('.tgz'):
            # Single-argument call form: valid on Python 2 and Python 3.
            print('process %s; %d docs so far...' % (fileName, docCount))
            processTar(fileName, fOut)

print('TOTAL: %s' % docCount)
-
# Run something like this:
-"""
-
-# Europarl V5 makes 76,917 docs, avg 38.6 KB per
-python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
-shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
-rm /x/lucene/data/europarl/tmp.lines.txt
-
-# Run again, this time each paragraph is a doc:
-# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
-python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
-shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
-rm /x/lucene/data/europarl/tmp.lines.txt
-
-# ~5.5 MB gzip'd:
-head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
-head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
-shuf tmp.txt > europarl.subset.txt
-rm -f tmp.txt
-gzip --best europarl.subset.txt
-"""