1 # Licensed to the Apache Software Foundation (ASF) under one or more
2 # contributor license agreements. See the NOTICE file distributed with
3 # this work for additional information regarding copyright ownership.
4 # The ASF licenses this file to You under the Apache License, Version 2.0
5 # (the "License"); you may not use this file except in compliance with
6 # the License. You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
# Command-line flag handling and the line-classification patterns used below.
# NOTE(review): this excerpt looks like a lossy extraction; the statements that
# originally surrounded the remove() calls are not visible here.
# Remove the first '-verbose' entry from sys.argv; list.remove raises
# ValueError when the entry is absent.
23 sys.argv.remove('-verbose')
# Remove the first '-docPerParagraph' entry from sys.argv in the same way.
29 sys.argv.remove('-docPerParagraph')
# docPerParagraph is assigned both True and False in the visible lines;
# presumably the two assignments sit on the success and failure paths of a
# guard around the remove() call above -- TODO confirm against the full file.
30 docPerParagraph = True
32 docPerParagraph = False
# Matches a line that starts with '<CHAPTER ID=' and ends with '>'.
34 reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
# Matches a line that starts with '<' and ends with '>'.
35 reTagOnly = re.compile('^<.*?>$')
# Matches a line made only of digits, optionally followed by one '.'.
36 reNumberOnly = re.compile(r'^\d+\.?$')
# Build one tab-separated output line (title, date, body) from the pending text.
# NOTE(review): the statements that consume `line` and `fOut` are not visible
# in this excerpt, so what is written, and under which condition, cannot be
# confirmed from here.
41 def write(date, title, pending, fOut):
# Join the pending pieces with single spaces, replace tabs with spaces (tab is
# the field separator of the line built below), and trim surrounding whitespace.
43 body = ' '.join(pending).replace('\t', ' ').strip()
# Field order is title, date, body; the line is newline-terminated.
45 line = '%s\t%s\t%s\n' % (title, date, body)
# Walk the members of one gzip-compressed tar archive and pass the accumulated
# text to write().
# NOTE(review): the loop header over archive members, the initialisation of
# `title` and `pending`, and several branch conditions are not visible in this
# excerpt; the comments below describe only the visible statements.
52 def processTar(fileName, fOut):
# Open the archive for reading with gzip decompression.
56 t = tarfile.open(fileName, 'r:gz')
# Only regular-file members are handled. Once `didEnglish` (defined outside the
# visible lines) is true, members whose name contains '/en/' are skipped.
58 if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
# Split the member path into its '/'-separated components.
60 tup = ti.name.split('/')
# Characters [3:5] of the third path component are parsed as the year.
# NOTE(review): that slice yields at most two digits; any century adjustment
# made before the date is built below is not visible here.
62 year = int(tup[2][3:5])
# Characters [6:8] and [9:11] of the same component are parsed as month and day.
68 month = int(tup[2][6:8])
69 day = int(tup[2][9:11])
70 date = datetime.date(year=year, month=month, day=day)
# Python 2 print statement: report the member name and its parsed date.
74 print '%s: %s' % (ti.name, date)
# Read the whole member and iterate over its lines.
78 for line in t.extractfile(ti).readlines():
# A chapter-tag line leads to a write() of what has been accumulated so far;
# any additional condition around that call is not visible here.
80 if reChapterOnly.match(line) is not None:
82 write(date, title, pending, fOut)
# Lines that are neither number-only nor tag-only reach the 'title' report
# below; presumably such a line becomes the title -- TODO confirm.
86 if not reNumberOnly.match(line) and not reTagOnly.match(line):
90 print ' title %s' % line
# A '<p>' line (compared case-insensitively) marks a paragraph boundary. Both a
# write() call and a 'PARSEP' marker append are visible; presumably
# docPerParagraph selects between them -- TODO confirm.
92 if line.lower() == '<p>':
94 write(date, title, pending, fOut)
96 pending.append('PARSEP')
# Branch for lines that are not a lone tag; its body is not visible here.
97 elif not reTagOnly.match(line):
# Write whatever is still pending, but only when a title has been set and there
# is pending text.
99 if title is not None and len(pending) > 0:
100 write(date, title, pending, fOut)
# Script driver: process every matching archive and report the document total.
# NOTE(review): `dirIn` and `docCount` are defined outside the visible lines.
104 # '/x/lucene/data/europarl/all.lines.txt'
# The output path is the second command-line argument.
106 fileOut = sys.argv[2]
# The output file is opened in binary write mode; no matching close() is visible
# in this excerpt.
108 fOut = open(fileOut, 'wb')
# Select files under dirIn named <2 chars>-<2 chars>.tgz.
110 for fileName in glob.glob('%s/??-??.tgz' % dirIn):
# The glob pattern above already ends in '.tgz', so this check holds for every
# match.
111 if fileName.endswith('.tgz'):
112 print 'process %s; %d docs so far...' % (fileName, docCount)
113 processTar(fileName, fOut)
# Report the final value of docCount.
115 print 'TOTAL: %s' % docCount
117 # Run something like this:
120 # Europarl V5 makes 76,917 docs, avg 38.6 KB per doc:
121 python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
122 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
123 rm /x/lucene/data/europarl/tmp.lines.txt
125 # Run again, this time each paragraph is a doc:
126 # Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per paragraph:
127 python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
128 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
129 rm /x/lucene/data/europarl/tmp.lines.txt
132 head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
133 head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
134 shuf tmp.txt > europarl.subset.txt
136 gzip --best europarl.subset.txt