PyLucene 3.4.0-1 import
[pylucene.git] / samples / LuceneInAction / lia / util / Streams.py
1 # ====================================================================
2 #   Licensed under the Apache License, Version 2.0 (the "License");
3 #   you may not use this file except in compliance with the License.
4 #   You may obtain a copy of the License at
5 #
6 #       http://www.apache.org/licenses/LICENSE-2.0
7 #
8 #   Unless required by applicable law or agreed to in writing, software
9 #   distributed under the License is distributed on an "AS IS" BASIS,
10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 #   See the License for the specific language governing permissions and
12 #   limitations under the License.
13 # ====================================================================
14
15 from StringIO import StringIO
16 from HTMLParser import HTMLParser
17
18
class InputStreamReader(object):
    """Reader that decodes the bytes of an underlying stream into text.

    The wrapped ``inputStream`` only needs ``read(length)`` and ``close()``.
    Bytes are decoded with ``encoding``; a false value (``None``, ``''``)
    selects ``'utf-8'``.
    """

    def __init__(self, inputStream, encoding):
        """Wrap ``inputStream``, decoding its bytes with ``encoding``."""

        super(InputStreamReader, self).__init__()
        self.inputStream = inputStream
        self.encoding = encoding or 'utf-8'
        # Created on first read(); keeps undecoded trailing bytes between
        # calls so that chunk boundaries may fall inside a character.
        self._decoder = None

    def _read(self, length):
        """Return up to ``length`` raw bytes from the wrapped stream."""

        return self.inputStream.read(length)

    def _getDecoder(self):
        """Return the incremental decoder, creating it on first use."""

        # getattr() so that an instance whose __init__ was bypassed or
        # overridden without calling ours still works.
        decoder = getattr(self, '_decoder', None)
        if decoder is None:
            import codecs

            decoder = codecs.getincrementaldecoder(self.encoding)()
            self._decoder = decoder

        return decoder

    def read(self, length=-1):
        """Read up to ``length`` bytes and return them decoded as text.

        A multi-byte character that straddles two chunks is held back and
        completed by the following chunk instead of raising a decode error.
        An empty string is returned only once the underlying stream is
        exhausted, so callers can keep using "empty means end of input".

        Raises UnicodeDecodeError when the data is invalid for the
        encoding, including a character left incomplete at end of input.
        """

        decoder = self._getDecoder()

        while True:
            raw = self._read(length)
            atEnd = len(raw) == 0
            # final=True at end of input makes a dangling partial
            # character an error rather than silently dropping it.
            text = decoder.decode(raw, atEnd)
            if text or atEnd:
                return text
            # Only part of a character was available: read more bytes.

    def close(self):
        """Close the wrapped stream."""

        self.inputStream.close()
41
42
class HTMLReader(object):
    """Reader that returns only the text content of an HTML stream.

    Text obtained from ``reader`` is fed to an ``HTMLParser`` subclass
    whose ``handle_data`` callback collects character data in a buffer;
    ``read`` hands that buffered text back to the caller.
    """

    def __init__(self, reader):
        """Wrap ``reader``, an object providing ``read(length)``/``close()``."""

        self.reader = reader

        class htmlParser(HTMLParser):
            """Parser that accumulates character data for later retrieval."""

            def __init__(self):

                HTMLParser.__init__(self)

                # The buffer's write position always sits at the logical
                # end of the pending text; ``position`` is the offset of
                # the first character not yet returned by _read().
                self.buffer = StringIO()
                self.position = 0

            def handle_data(self, data):
                """Append a run of character data to the buffer."""

                self.buffer.write(data)

            def _read(self, length):
                """Return up to ``length`` pending characters.

                A non-positive ``length`` returns everything pending.
                Returns '' when nothing is pending.
                """

                buf = self.buffer
                end = buf.tell()
                size = end - self.position

                if size <= 0:
                    return ''

                buf.seek(self.position)

                if length > 0 and size > length:
                    data = buf.read(length)
                    self.position += len(data)
                    # Go back to the remembered logical end so that later
                    # writes append right after the pending text.
                    buf.seek(end)
                    return data

                data = buf.read(size)
                # Everything was consumed: rewind and drop the old
                # contents so they can never be mistaken for new text.
                self.position = 0
                buf.seek(0)
                buf.truncate()

                return data

        self.parser = htmlParser()

    def read(self, length=-1):
        """Return the next piece of extracted text, '' at end of input.

        Chunks of up to ``length`` characters are pulled from the wrapped
        reader until one of them yields text. When the wrapped reader is
        exhausted, whatever text is still buffered in the parser is
        returned (at most ``length`` characters per call) before ''.
        """

        while True:
            data = self.reader.read(length)
            if not data:
                # End of input: hand out text still waiting in the buffer.
                return self.parser._read(length)

            self.parser.feed(data)
            text = self.parser._read(length)
            if text:
                return text
            # The chunk contained markup only; keep reading.

    def close(self):
        """Close the wrapped reader."""

        self.reader.close()