2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0
45 # HTTP "User-Agent" header to send to servers when downloading feeds.
46 # If you are embedding feedparser in a larger application, you should
47 # change this to your application name and URL.
48 USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
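# For example, an embedding application would typically override this at
# import time (illustrative values only, not part of the original module):
#
#     import feedparser
#     feedparser.USER_AGENT = "MyAggregator/1.0 +http://example.com/aggregator/"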
50 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
51 # want to send an Accept header, set this to None.
52 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
54 # List of preferred XML parsers, by SAX driver name. These will be tried first,
55 # but if they're not installed, Python will keep searching through its own list
56 # of pre-installed parsers until it finds one that supports everything we need.
57 PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except ImportError:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data
# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self.__getitem__(key)
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self.__setitem__(key, value)
        return value

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)
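# Illustrative sketch (not part of the original module): FeedParserDict lets
# pre-Atom key names alias their newer equivalents through keymap, so client
# code written against either vocabulary keeps working.
def _example_feedparserdict():
    d = FeedParserDict()
    d['feed'] = FeedParserDict({'title': u'Example Feed'})
    d['updated'] = u'2006-01-01T00:00:00Z'
    assert d['channel'] is d['feed']        # 'channel' is remapped to 'feed'
    assert d['modified'] == d['updated']    # 'modified' is remapped to 'updated'
    assert d.feed.title == u'Example Feed'  # keys double as attributes
    return d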
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)
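# Quick illustrative check (not part of the original module): in EBCDIC,
# 0x81 is 'a' and 0xF1 is '1', so a two-byte EBCDIC string comes back as ASCII.
def _example_ebcdic_to_ascii():
    assert _ebcdic_to_ascii('\x81\xf1') == 'a1'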
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
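# Illustrative sketch (not part of the original module): _urifixer collapses
# stray slashes that some feeds emit right after the URI scheme before the
# standard urljoin runs.
def _example_urljoin():
    # relative references resolve against the base
    assert _urljoin('http://example.com/feeds/', 'entries/1') == 'http://example.com/feeds/entries/1'
    # extra slashes after the scheme are stripped first
    assert _urljoin('http://example.com/', 'http:////example.com/feed') == 'http://example.com/feed'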
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/XML/1998/namespace': 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang
    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType
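    # Added illustrative note (not original): the Atom shorthand type="html"
    # is normalized to 'text/html' and type="xhtml" to 'application/xhtml+xml';
    # full MIME types such as 'text/html' pass through unchanged (lowercased).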
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])
    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))
    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
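    # Added illustrative note (not original): content is treated as base64
    # only when mode="base64" is declared explicitly, or when the type is
    # neither text/* nor an XML type -- binary types like image/png default
    # to base64, but type="text/html" never does.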
    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'
    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel
    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0
    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name
    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value
    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url
    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context
    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
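    # Added illustrative note: given <author>John Doe (jdoe@example.com)</author>,
    # the regex above extracts 'jdoe@example.com', the emptied parentheses are
    # stripped, and author_detail ends up as
    # {'name': u'John Doe', 'email': u'jdoe@example.com'}.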
    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))
    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(value)
    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)
    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)
    def _start_title(self, attrsD):
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title
    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href
    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)
    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)
    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all).  Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())
    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'
    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())
    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('frame', 'longdesc'),
                     ('iframe', 'longdesc'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('input', 'usemap'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'usemap')]
    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
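# Illustrative sketch (not part of the original module) of the resolver in
# action; values are examples only:
def _example_resolve_relative():
    html = '<a href="/about">about</a>'
    # the ('a', 'href') pair is in relative_uris, so the href is rewritten
    # against the base URI: '<a href="http://example.com/about">about</a>'
    return _resolveRelativeURIs(html, 'http://example.com/blog/', 'utf-8')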
class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def _sanitizeHTML(htmlSource, encoding):
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
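# Illustrative sketch (not part of the original module): the sanitizer drops
# elements and attributes outside the whitelists above, including the entire
# contents of <script> and <applet>:
def _example_sanitize():
    # with TIDY_MARKUP = 0 this returns '<p>hi</p>'
    return _sanitizeHTML('<p onclick="evil()">hi<script>alert(1)</script></p>', 'utf-8')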
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            return self.http_error_default(req, fp, code, msg, headers)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            # e.g. 'Sat, 07 Sep 2002 00:00:01 GMT'
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
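# A minimal usage sketch for _open_resource (illustrative values): fetch a
# remote feed conditionally with a previously saved ETag and read the raw
# bytes; a 304 response still yields a readable stream with a .status of 304.
#
#     f = _open_resource('http://feedparser.org/docs/examples/atom10.xml',
#                        '"mirror-1"', None, USER_AGENT, None, [])
#     raw_data = f.read()
#     f.close()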
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
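# Example of extending the date parsers at runtime (a sketch; the handler
# name and format are hypothetical, not part of the shipped module).
# Handlers registered last are tried first:
#
#     def _parse_date_compact(dateString):
#         '''Parse compact YYYYMMDDHHMMSS timestamps, assumed to be GMT'''
#         try:
#             return time.strptime(dateString, '%Y%m%d%H%M%S')
#         except ValueError:
#             return None
#     registerDateHandler(_parse_date_compact)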
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
del tmpl
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
del regex
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
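# Illustrative inputs the templates above accept (assumed GMT unless a zone
# designator is present): '2003-12-31T10:14:55Z', '20040105',
# '2003-335' (day 335 of 2003), '03-12-31' (two-digit years are read as the
# current century).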
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
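# Illustrative: '2004-07-08 23:56:58.456' is rewritten to the W3DTF string
# '2004-07-08T23:56:58+09:00' (fractional seconds are dropped; the hard-coded
# +09:00 offset is presumably aimed at the same Korean feeds as the handlers
# above) and then re-parsed by _parse_date_w3dtf.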
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license. Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        if month is None:
            month = 1
        else:
            month = int(month)
        day = m.group('day')
        if day:
            day = int(day)
        else:
            day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<julian>\d\d\d)'
                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
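# Illustrative W3DTF inputs handled above: '2003-12-31T10:14:55Z',
# '2003-12-31T10:14:55-08:00', and date-only forms like '2003-12-31'
# (time defaults to 00:00:00 GMT).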
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
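# Illustrative inputs: 'Thu, 01 Jan 2004 19:48:21 GMT' (RFC 822),
# 'Thu, 01 Jan 2004' (date only, padded to midnight GMT above), and
# asctime-style 'Sun Nov  6 08:49:37 1994'.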
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
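# Dispatch sketch: handlers are tried most-recently-registered first, and
# the first one to return a valid 9-tuple wins.
#
#     _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')  # handled by _parse_date_rfc822
#     _parse_date('2003-12-31T10:14:55Z')           # handled by _parse_date_w3dtf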
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified. But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii. (We now do this.) And also that it
    must always be flagged as non-well-formed. (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible. Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not). CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')
    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration. This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
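# Precedence sketch (illustrative): served as 'text/xml' with no charset,
# the XML declaration is ignored and true_encoding is 'us-ascii'; served as
# 'application/xml; charset=utf-16', the HTTP charset wins over the XML
# declaration; served over HTTP with no Content-Type at all, the XML
# declaration wins, defaulting to 'iso-8859-1'; for non-HTTP input (empty
# http_headers) the XML declaration wins, defaulting to 'utf-8'.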
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
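# Illustrative round-trip (assumed input): given iso-8859-1 bytes
#     "<?xml version='1.0' encoding='iso-8859-1'?><a>caf\xe9</a>"
# _toUTF8(data, 'iso-8859-1') returns UTF-8 bytes whose declaration has been
# rewritten to encoding='utf-8'.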
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
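# Illustrative: a Netscape RSS 0.91 feed declaring
#     <!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" ...>
# yields ('rss091n', data minus the DOCTYPE); any other (or no) DOCTYPE
# yields (None, stripped data).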
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string'''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage. Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()
    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data. This is a feature, not a bug!'
        return result
    # if there was a problem downloading, we're done
    if not data:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding
    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
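# Typical usage (sketch):
#
#     d = parse('http://feedparser.org/docs/examples/atom10.xml')
#     print d['feed']['title']
#     for entry in d['entries']:
#         print entry['title'], entry['link']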
if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    zopeCompatibilityHack()
    from pprint import pprint
    for url in sys.argv[1:]:
        pprint(parse(url))
#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#  added Simon Fell's test suite
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
#2.0 - 10/19/2002
#  JD - use inchannel to watch out for image and textinput elements which can
#  also contain title, link, and description elements
#  JD - check for isPermaLink='false' attribute on guid elements
#  JD - replaced openAnything with open_resource supporting ETag and
#  If-Modified-Since request headers
#  JD - parse now accepts etag, modified, agent, and referrer optional
#  arguments
#  JD - modified parse to return a dictionary instead of a tuple so that any
#  etag or modified information can be returned and cached by the caller
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
#  because of etag/modified, return the old etag/modified to the caller to
#  indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
#  useless. Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
#  start_admingeneratoragent is an example of how to handle elements with
#  only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
#  also, make sure we send the User-Agent even if urllib2 isn't available.
#  Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
#  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
#  project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
#  removed unnecessary urllib code -- urllib2 should always be available anyway;
#  return actual url, status, and full HTTP headers (as result['url'],
#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
#  added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
#  textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
#  nested divs within content (JohnD); fixed missing sys import (JohanS);
#  fixed regular expression to capture XML character encoding (Andrei);
#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
#  added support for cloud (MartijnP); added support for multiple
#  category/dc:subject (MartijnP); normalize content model: 'description' gets
#  description (which can come from description, summary, or full content if no
#  description), 'content' gets dict of base/language/type/value (which can come
#  from content:encoded, xhtml:body, content, or fullitem);
#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
#  <content> element is not in default namespace (like Pocketsoap feed);
#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
#  description, xhtml:body, content, content:encoded, title, subtitle,
#  summary, info, tagline, and copyright; added support for pingback and
#  trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
#  namespaces, as opposed to 2.6 when I said I did but didn't really;
#  sanitize HTML markup within some elements; added mxTidy support (if
#  installed) to tidy HTML markup within some elements; fixed indentation
#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
#  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
#  fixed relative URI processing for guid (skadz); added ICBM support; added
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
#  blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
#  added several new supported namespaces; fixed bug tracking naked markup in
#  description; added support for enclosure; added support for source; re-added
#  support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
#  xml:base URI, one for documents that don't define one explicitly and one for
#  documents that define an outer and an inner xml:base that goes out of scope
#  before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
#  added support for creativeCommons:license and cc:license; added support for
#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
#  with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
#  contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
#  support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
#  dangerous markup; fiddled with decodeEntities (not right); liberalized
#  date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
#  added support to Atom 0.2 subtitle; added support for Atom content model
#  in copyright; better sanitizing of dangerous HTML elements with end tags
#  (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
#  Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
#  fixed bug capturing author and contributor URL; fixed bug resolving relative
#  links in author and contributor URL; fixed bug resolving relative links in
#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
#  namespace tests, and included them permanently in the test suite with his
#  permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
#  use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
#  workaround crash in PyXML/expat when encountering invalid entities
#  (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
#  results dict; changed results dict to allow getting values with results.key
#  as well as results[key]; work around embedded illformed HTML with half
#  a DOCTYPE; work around malformed Content-Type header; if character encoding
#  is wrong, try several common ones before falling back to regexes (if this
#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
#  convert each value in results to Unicode (if possible), even if using
#  regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
#  high-bit characters in attributes in embedded HTML in description (thanks
#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
#  about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
#  cause the same encoding to be tried twice (even if it failed the first time);
#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
#  better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
#  my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
#  added support for image; refactored parse() fallback logic to try other
#  encodings if SAX parsing fails (previously it would only try other encodings
#  if re-encoding failed); remove unichr madness in normalize_attrs now that
#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
#  feed.language from root-level xml:lang; set entry.id from rdf:about;
#  send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
#  windows-1252); fixed regression that could cause the same encoding to be
#  tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
#  recover from malformed content-type header parameter with no equals sign
#  ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
#  to Unicode equivalents in illformed feeds (aaronsw); added and
#  passed tests for converting character entities to Unicode equivalents
#  in illformed feeds (aaronsw); test for valid parsers when setting
#  XML_AVAILABLE; make version and encoding available when server returns
#  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
#  digest auth or proxy support); add code to parse username/password
#  out of url and send as basic authentication; expose downloading-related
#  exceptions in bozo_exception (aaronsw); added __contains__ method to
#  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
#  convert feed to UTF-8 before passing to XML parser; completely revamped
#  logic for determining character encoding and attempting XML parsing
#  (much faster); increased default timeout to 20 seconds; test for presence
#  of Location header on redirects; added tests for many alternate character
#  encodings; support various EBCDIC encodings; support UTF-16BE and
#  UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
#  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
#  XML parsers are available; added support for 'Content-encoding: deflate';
#  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
#  are installed
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
#  problem tracking xml:base and xml:lang if element declares it, child
#  doesn't, first grandchild redeclares it, and second grandchild doesn't;
#  refactored date parsing; defined public registerDateHandler so callers
#  can add support for additional date formats at runtime; added support
#  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
#  zopeCompatibilityHack() which turns FeedParserDict into a regular
#  dictionary, required for Zope compatibility, and also makes command-
#  line debugging easier because pprint module formats real dictionaries
#  better than dictionary-like objects; added NonXMLContentType exception,
#  which is stored in bozo_exception when a feed is served with a non-XML
#  media type such as 'text/plain'; respect Content-Language as default
#  language if no xml:lang is present; cloud dict is now FeedParserDict;
#  generator dict is now FeedParserDict; better tracking of xml:lang,
#  including support for xml:lang='' to unset the current language;
#  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
#  namespace; don't overwrite final status on redirects (scenarios:
#  redirecting to a URL that returns 304, redirecting to a URL that
#  redirects to another URL with a different type of redirect); add
#  support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
#  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
#  support for Atom 1.0; support for iTunes extensions; new 'tags' for
#  categories/keywords/etc. as array of dict
#  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
#  terminology; parse RFC 822-style dates with no time; lots of other
#  bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library