# X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
# diff --git a/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/makeEuroparlLineFile.py b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
# new file mode 100644
# index 0000000..2cfda33
# --- /dev/null
# +++ b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/makeEuroparlLineFile.py
# @@ -0,0 +1,137 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import glob
import datetime
import tarfile
import re


def _popFlag(flag):
    """Remove `flag` from sys.argv if present; return True if it was there."""
    try:
        sys.argv.remove(flag)
    except ValueError:
        return False
    return True


# Optional switches are stripped from sys.argv up front so that only the two
# positional arguments (input directory, output file) are left for main().
VERBOSE = _popFlag('-verbose')
docPerParagraph = _popFlag('-docPerParagraph')

# Archive members are read as raw bytes and written back out as raw bytes, so
# every pattern and marker below is a byte string.  Under Python 2 these are
# ordinary str objects; under Python 3 this avoids guessing the text encoding.
reChapterOnly = re.compile(b'^<CHAPTER ID=.*?>$')
reTagOnly = re.compile(b'^<.*?>$')
reNumberOnly = re.compile(br'^\d+\.?$')

# A line consisting solely of this tag (compared case-insensitively) ends a
# paragraph.
PARAGRAPH_TAG = b'<p>'
# Token inserted between paragraphs when a whole chapter becomes one document.
PARAGRAPH_SEP = b'PARSEP'

# Number of documents written so far (updated by write()).
docCount = 0
# Set once the first archive has been processed; later archives then skip
# their '/en/' members (see processTar()).
didEnglish = False


def _toBytes(value):
    """Return `value` as a byte string, formatting non-bytes values via str()."""
    if isinstance(value, bytes):
        return value
    return str(value).encode('utf-8')


def _display(raw):
    """Return `raw` as a native str suitable for diagnostic printing."""
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def write(date, title, pending, fOut):
    """Emit one `title<TAB>date<TAB>body` line built from `pending`.

    The body is the items of `pending` joined with single spaces, with tabs
    replaced by spaces and surrounding whitespace stripped.  If the body is
    empty nothing is written and `pending` is left untouched; otherwise the
    line is written to `fOut`, the global `docCount` is incremented and
    `pending` is emptied in place.

    date    -- value formatted with str() (a datetime.date in this script)
    title   -- document title (bytes, or anything str() can format)
    pending -- mutable list of text fragments
    fOut    -- file-like object opened in binary mode
    """
    global docCount
    body = b' '.join(_toBytes(p) for p in pending).replace(b'\t', b' ').strip()
    if not body:
        return
    fOut.write(b'\t'.join((_toBytes(title), _toBytes(date), body)) + b'\n')
    docCount += 1
    del pending[:]
    if VERBOSE:
        print(len(body))


def _dateFromName(name):
    """Parse the date out of a member path shaped like `x/lang/ep-YY-MM-DD...`.

    Two-digit years below 20 are taken as 20YY, all others as 19YY.
    """
    stem = name.split('/')[2]
    year = int(stem[3:5])
    year += 2000 if year < 20 else 1900
    return datetime.date(year=year, month=int(stem[6:8]), day=int(stem[9:11]))


def _processMember(t, ti, fOut):
    """Convert one archive member into line-file documents written to `fOut`."""
    date = _dateFromName(ti.name)

    if VERBOSE:
        print('')
        print('%s: %s' % (ti.name, date))

    nextIsTitle = False
    title = None
    pending = []
    for line in t.extractfile(ti):
        line = line.strip()
        if reChapterOnly.match(line) is not None:
            # A new chapter starts: flush the previous one, then look for
            # the title of the new one.
            if title is not None:
                write(date, title, pending, fOut)
            nextIsTitle = True
            continue
        if nextIsTitle:
            # The title is the first line after the chapter tag that is
            # neither a bare number ("12.") nor a bare tag.
            if not reNumberOnly.match(line) and not reTagOnly.match(line):
                title = line
                nextIsTitle = False
                if VERBOSE:
                    print(' title %s' % _display(line))
            continue
        if line.lower() == PARAGRAPH_TAG:
            if docPerParagraph:
                write(date, title, pending, fOut)
            else:
                pending.append(PARAGRAPH_SEP)
        elif not reTagOnly.match(line):
            pending.append(line)
    if title is not None and len(pending) > 0:
        write(date, title, pending, fOut)


def processTar(fileName, fOut):
    """Convert every regular file in the gzip'd tar `fileName` to line docs.

    Once one archive has been processed completely, the global `didEnglish`
    is set and members whose path contains '/en/' are skipped in all later
    calls.

    fileName -- path of a .tgz archive
    fOut     -- file-like object opened in binary mode
    """
    global didEnglish

    t = tarfile.open(fileName, 'r:gz')
    try:
        for ti in t:
            if not ti.isfile():
                continue
            if didEnglish and '/en/' in ti.name:
                continue
            _processMember(t, ti, fOut)
    finally:
        # Always release the archive handle, even if a member fails to parse.
        t.close()

    didEnglish = True


def main():
    """Convert all `??-??.tgz` archives in argv[1] into the line file argv[2]."""
    if len(sys.argv) < 3:
        sys.exit('usage: %s [-verbose] [-docPerParagraph] dirIn fileOut'
                 % sys.argv[0])

    # '/x/lucene/data/europarl/all.lines.txt'
    dirIn = sys.argv[1]
    fileOut = sys.argv[2]

    with open(fileOut, 'wb') as fOut:
        # Sorted so that the archive supplying the '/en/' members is the same
        # on every run; glob order is otherwise unspecified.
        for fileName in sorted(glob.glob('%s/??-??.tgz' % dirIn)):
            print('process %s; %d docs so far...' % (fileName, docCount))
            processTar(fileName, fOut)

    print('TOTAL: %s' % docCount)


if __name__ == '__main__':
    main()

#run something like this:
"""

# Europarl V5 makes 76,917 docs, avg 38.6 KB per
python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
rm /x/lucene/data/europarl/tmp.lines.txt

# Run again, this time each paragraph is a doc:
# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
rm /x/lucene/data/europarl/tmp.lines.txt

# ~5.5 MB gzip'd:
head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
shuf tmp.txt > europarl.subset.txt
rm -f tmp.txt
gzip --best europarl.subset.txt
"""