lucene-java-3.4.0/lucene/backwards/src/test/org/apache/lucene/util/makeEuroparlLineFile.py

   1 # Licensed to the Apache Software Foundation (ASF) under one or more
   2 # contributor license agreements.  See the NOTICE file distributed with
   3 # this work for additional information regarding copyright ownership.
   4 # The ASF licenses this file to You under the Apache License, Version 2.0
   5 # (the "License"); you may not use this file except in compliance with
   6 # the License.  You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS,
  12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 # See the License for the specific language governing permissions and
  14 # limitations under the License.
  15
  16 import sys
  17 import glob
  18 import datetime
  19 import tarfile
  20 import re
  21
  22 try:
  23   sys.argv.remove('-verbose')
  24   VERBOSE = True
  25 except ValueError:
  26   VERBOSE = False
  27
  28 try:
  29   sys.argv.remove('-docPerParagraph')
  30   docPerParagraph = True
  31 except ValueError:
  32   docPerParagraph = False
  33
  34 reChapterOnly = re.compile('^<CHAPTER ID=.*?>$')
  35 reTagOnly = re.compile('^<.*?>$')
  36 reNumberOnly = re.compile(r'^\d+\.?$')
  37
  38 docCount = 0
  39 didEnglish = False
  40
  41 def write(date, title, pending, fOut):
  42   global docCount
  43   body = ' '.join(pending).replace('\t', ' ').strip()
  44   if len(body) > 0:
  45     line = '%s\t%s\t%s\n' % (title, date, body)
  46     fOut.write(line)
  47     docCount += 1
  48     del pending[:]
  49     if VERBOSE:
  50       print len(body)
  51
  52 def processTar(fileName, fOut):
  53
  54   global didEnglish
  55
  56   t = tarfile.open(fileName, 'r:gz')
  57   for ti in t:
  58     if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1):
  59
  60       tup = ti.name.split('/')
  61       lang = tup[1]
  62       year = int(tup[2][3:5])
  63       if year < 20:
  64         year += 2000
  65       else:
  66         year += 1900
  67
  68       month = int(tup[2][6:8])
  69       day = int(tup[2][9:11])
  70       date = datetime.date(year=year, month=month, day=day)
  71
  72       if VERBOSE:
  73         print
  74         print '%s: %s' % (ti.name, date)
  75       nextIsTitle = False
  76       title = None
  77       pending = []
  78       for line in t.extractfile(ti).readlines():
  79         line = line.strip()
  80         if reChapterOnly.match(line) is not None:
  81           if title is not None:
  82             write(date, title, pending, fOut)
  83           nextIsTitle = True
  84           continue
  85         if nextIsTitle:
  86           if not reNumberOnly.match(line) and not reTagOnly.match(line):
  87             title = line
  88             nextIsTitle = False
  89             if VERBOSE:
  90               print '  title %s' % line
  91           continue
  92         if line.lower() == '<p>':
  93           if docPerParagraph:
  94             write(date, title, pending, fOut)
  95           else:
  96             pending.append('PARSEP')
  97         elif not reTagOnly.match(line):
  98           pending.append(line)
  99       if title is not None and len(pending) > 0:
 100         write(date, title, pending, fOut)
 101
 102   didEnglish = True
 103
 104 # '/x/lucene/data/europarl/all.lines.txt'
 105 dirIn = sys.argv[1]
 106 fileOut = sys.argv[2]
 107
 108 fOut = open(fileOut, 'wb')
 109
 110 for fileName in glob.glob('%s/??-??.tgz' % dirIn):
 111   if fileName.endswith('.tgz'):
 112     print 'process %s; %d docs so far...' % (fileName, docCount)
 113     processTar(fileName, fOut)
 114
 115 print 'TOTAL: %s' % docCount
 116
# Run something like this (the examples invoke the script as europarl.py;
# substitute this file's actual name):
 118 """
 119
 120 # Europarl V5 makes 76,917 docs, avg 38.6 KB per
 121 python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt
 122 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt
 123 rm /x/lucene/data/europarl/tmp.lines.txt
 124
 125 # Run again, this time each paragraph is a doc:
 126 # Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per:
 127 python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph
 128 shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt
 129 rm /x/lucene/data/europarl/tmp.lines.txt
 130
 131 # ~5.5 MB gzip'd:
 132 head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt
 133 head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt
 134 shuf tmp.txt > europarl.subset.txt
 135 rm -f tmp.txt
 136 gzip --best europarl.subset.txt
 137 """