source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/feedparser.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 156.3 KB
Line 
1"""Universal feed parser
2
3Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
4
5Visit https://code.google.com/p/feedparser/ for the latest version
6Visit http://packages.python.org/feedparser/ for the latest documentation
7
8Required: Python 2.4 or later
9Recommended: iconv_codec <http://cjkpython.i18n.org/>
10"""
11
12__version__ = "5.2.1"
13__license__ = """
14Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
15Copyright 2002-2008 Mark Pilgrim
16All rights reserved.
17
18Redistribution and use in source and binary forms, with or without modification,
19are permitted provided that the following conditions are met:
20
21* Redistributions of source code must retain the above copyright notice,
22  this list of conditions and the following disclaimer.
23* Redistributions in binary form must reproduce the above copyright notice,
24  this list of conditions and the following disclaimer in the documentation
25  and/or other materials provided with the distribution.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE."""
38__author__ = "Mark Pilgrim <http://diveintomark.org/>"
39__contributors__ = ["Jason Diamond <http://injektilo.org/>",
40                    "John Beimler <http://john.beimler.org/>",
41                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
42                    "Aaron Swartz <http://aaronsw.com/>",
43                    "Kevin Marks <http://epeus.blogspot.com/>",
44                    "Sam Ruby <http://intertwingly.net/>",
45                    "Ade Oshineye <http://blog.oshineye.com/>",
46                    "Martin Pool <http://sourcefrog.net/>",
47                    "Kurt McKee <http://kurtmckee.org/>",
48                    "Bernd Schlapsi <https://github.com/brot>",]
49
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
# Feed MIME types are preferred (q=1 implied); generic XML is acceptable,
# and anything else is a last resort.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.  (Truthy/falsy flag; kept as an int for historical compatibility.)
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.  (Truthy/falsy flag, same convention as above.)
SANITIZE_HTML = 1
71
72# ---------- Python 3 modules (make it work if possible) ----------
73try:
74    import rfc822
75except ImportError:
76    from email import _parseaddr as rfc822
77
78try:
79    # Python 3.1 introduces bytes.maketrans and simultaneously
80    # deprecates string.maketrans; use bytes.maketrans if possible
81    _maketrans = bytes.maketrans
82except (NameError, AttributeError):
83    import string
84    _maketrans = string.maketrans
85
86# base64 support for Atom feeds that contain embedded binary data
87try:
88    import base64, binascii
89except ImportError:
90    base64 = binascii = None
91else:
92    # Python 3.1 deprecates decodestring in favor of decodebytes
93    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
94
95# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
96# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
97try:
98    if bytes is str:
99        # In Python 2.5 and below, bytes doesn't exist (NameError)
100        # In Python 2.6 and above, bytes and str are the same type
101        raise NameError
102except NameError:
103    # Python 2
104    def _s2bytes(s):
105        return s
106    def _l2bytes(l):
107        return ''.join(map(chr, l))
108else:
109    # Python 3
110    def _s2bytes(s):
111        return bytes(s, 'utf8')
112    def _l2bytes(l):
113        return bytes(l)
114
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
# (Fix: 'mms' and 'svn' previously appeared twice; duplicates removed.
# Membership tests are unaffected.)
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
131
132# ---------- required modules (should come with any Python distribution) ----------
133import cgi
134import codecs
135import copy
136import datetime
137import itertools
138import re
139import struct
140import time
141import types
142import urllib
143import urllib2
144import urlparse
145import warnings
146
147from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
148
149try:
150    from io import BytesIO as _StringIO
151except ImportError:
152    try:
153        from cStringIO import StringIO as _StringIO
154    except ImportError:
155        from StringIO import StringIO as _StringIO
156
157# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
158
159# gzip is included with most Python distributions, but may not be available if you compiled your own
160try:
161    import gzip
162except ImportError:
163    gzip = None
164try:
165    import zlib
166except ImportError:
167    zlib = None
168
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        # Minimal stand-in for xml.sax.saxutils.escape(): escape '&', '<'
        # and '>' plus any caller-supplied {character: entity} replacements.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        # Fix: *entities* is a dict (same contract as saxutils.escape), so
        # iterate over its items; iterating the dict directly attempted to
        # unpack single-character keys and raised ValueError.
        for char, entity in entities.items():
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1
192
# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing and content sanitizing
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                # Stub: the real sgmllib drives the parse loop from here.
                pass
            def parse_starttag(self, i):
                # Stub: the real sgmllib parses start tags here.
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    # Decimal ('&#160;') or hex ('&#xA0;') character references:
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    # Tag names, extended to allow ':' and '_' for namespaced XML elements:
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    # Attribute name/value pairs, quoted or bare:
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            # Find the next unquoted angle bracket in *target* starting at
            # *index*; returns None when no bracket is found.
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        # Minimal match-object wrapper: sgmllib only calls .start() on the
        # result of endbracket.search(), and here it must see the *end* of
        # the wrapped match (the bracket position).
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    endbracket = _EndBracketRegEx()
250
251
252# iconv_codec provides support for more character encodings.
253# It's available from http://cjkpython.i18n.org/
254try:
255    import iconv_codec
256except ImportError:
257    pass
258
259# chardet library auto-detects character encodings
260# Download from http://chardet.feedparser.org/
261try:
262    import chardet
263except ImportError:
264    chardet = None
265
# ---------- don't touch these ----------
# Internal exception hierarchy.  The subclasses of
# ThingsNobodyCaresAboutButMe let callers catch all encoding-related
# oddities with a single except clause; the names describe the condition
# (raised by code outside this chunk).
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
# Deliberately outside the hierarchy above: an undeclared namespace prefix
# is a hard error, not an encoding quirk.
class UndeclaredNamespace(Exception): pass
272
# Human-readable names for every feed format/version feedparser can detect,
# keyed by the internal version identifier (see _FeedParserMixin.version).
# The empty-string key means the feed type could not be determined.
SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }
290
class FeedParserDict(dict):
    """Dict subclass exposing feed data under both modern and legacy names.

    Keys can also be read as attributes (``d.feed`` == ``d['feed']``).
    ``keymap`` maps legacy key names to their modern equivalents; a list
    value means "first of these keys that exists".  A few virtual keys
    ('category', 'enclosures', 'license') are computed from stored data.

    Note: the py2-only ``raise Exc, msg`` statements were rewritten in the
    equivalent ``raise Exc(msg)`` call form, which behaves identically and
    is also valid Python 3 — consistent with this module's py2/py3
    compatibility shims.
    """
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'description_detail': ['summary_detail', 'subtitle_detail'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        if key == 'category':
            # Virtual key: the first tag's term.
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError("object doesn't have key 'category'")
        elif key == 'enclosures':
            # Virtual key: every rel=enclosure link, with 'rel' stripped.
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            # Virtual key: href of the first rel=license link.  Falls
            # through to the default lookup (KeyError) when none matches.
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and 'href' in link:
                    return link['href']
        elif key == 'updated':
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            if not dict.__contains__(self, 'updated') and \
                dict.__contains__(self, 'published'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated` to `published` if `updated` doesn't "
                    "exist. This fallback will be removed in a future version "
                    "of feedparser.", DeprecationWarning)
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
        elif key == 'updated_parsed':
            if not dict.__contains__(self, 'updated_parsed') and \
                dict.__contains__(self, 'published_parsed'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated_parsed` to `published_parsed` if "
                    "`updated_parsed` doesn't exist. This fallback will be "
                    "removed in a future version of feedparser.",
                    DeprecationWarning)
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
        else:
            # Translate legacy names; a list means "first key that exists".
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        # Fall back to the literal key (raises KeyError when absent).
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        if key in ('updated', 'updated_parsed'):
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Writes go through the alias table so legacy names land on the
        # modern key (first element when the mapping is a list).
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __hash__(self):
        # Identity hash: instances are mutable, so value hashing is unsafe.
        return id(self)
404
# Windows-1252 mapping for the 0x80-0x9F byte range, where cp1252 places
# printable punctuation and letters but iso-8859-1 has C1 control codes.
# NOTE(review): the consumers of this table are outside this chunk —
# presumably it repairs feeds mislabelled as latin-1; verify against the
# decoding code before relying on that description.
_cp1252 = {
    128: unichr(8364), # euro sign
    130: unichr(8218), # single low-9 quotation mark
    131: unichr( 402), # latin small letter f with hook
    132: unichr(8222), # double low-9 quotation mark
    133: unichr(8230), # horizontal ellipsis
    134: unichr(8224), # dagger
    135: unichr(8225), # double dagger
    136: unichr( 710), # modifier letter circumflex accent
    137: unichr(8240), # per mille sign
    138: unichr( 352), # latin capital letter s with caron
    139: unichr(8249), # single left-pointing angle quotation mark
    140: unichr( 338), # latin capital ligature oe
    142: unichr( 381), # latin capital letter z with caron
    145: unichr(8216), # left single quotation mark
    146: unichr(8217), # right single quotation mark
    147: unichr(8220), # left double quotation mark
    148: unichr(8221), # right double quotation mark
    149: unichr(8226), # bullet
    150: unichr(8211), # en dash
    151: unichr(8212), # em dash
    152: unichr( 732), # small tilde
    153: unichr(8482), # trade mark sign
    154: unichr( 353), # latin small letter s with caron
    155: unichr(8250), # single right-pointing angle quotation mark
    156: unichr( 339), # latin small ligature oe
    158: unichr( 382), # latin small letter z with caron
    159: unichr( 376), # latin capital letter y with diaeresis
}
434
# Matches a URI scheme plus '://' (group 1), any extra slashes that follow
# it (group 2, dropped by _urljoin), and the remainder (group 3).
# NOTE(review): inside the character class, '+-.' is a *range* that also
# matches ',' — likely intended as literal '+', '-', '.'; confirm before
# changing, since tightening it would alter which URIs get "fixed".
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    # Join *uri* against *base*, tolerating malformed input: extra slashes
    # directly after the scheme are collapsed, byte strings are decoded as
    # UTF-8 (undecodable bytes ignored), and urljoin failures yield u''.
    uri = _urifixer.sub(r'\1\3', uri)
    if not isinstance(uri, unicode):
        uri = uri.decode('utf-8', 'ignore')
    try:
        uri = urlparse.urljoin(base, uri)
    except ValueError:
        # urlparse.urljoin can raise on pathological input (e.g. bad ports)
        uri = u''
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
447
class _FeedParserMixin:
    # Maps every namespace URI feedparser understands to the prefix used
    # when composing handler method names (e.g. the 'dc' prefix yields
    # lookups like '_start_dc_title').  URIs mapped to '' are treated as
    # the core feed vocabulary (RSS/Atom element names without a prefix).
    namespaces = {
        '': '',
        'http://backend.userland.com/rss': '',
        'http://blogs.law.harvard.edu/tech/rss': '',
        'http://purl.org/rss/1.0/': '',
        'http://my.netscape.com/rdf/simple/0.9/': '',
        'http://example.com/newformat#': '',
        'http://example.com/necho': '',
        'http://purl.org/echo/': '',
        'uri/of/echo/namespace#': '',
        'http://purl.org/pie/': '',
        'http://purl.org/atom/ns#': '',
        'http://www.w3.org/2005/Atom': '',
        'http://purl.org/rss/1.0/modules/rss091#': '',

        'http://webns.net/mvcb/':                                'admin',
        'http://purl.org/rss/1.0/modules/aggregation/':          'ag',
        'http://purl.org/rss/1.0/modules/annotate/':             'annotate',
        'http://media.tangent.org/rss/1.0/':                     'audio',
        'http://backend.userland.com/blogChannelModule':         'blogChannel',
        'http://web.resource.org/cc/':                           'cc',
        'http://backend.userland.com/creativeCommonsRssModule':  'creativeCommons',
        'http://purl.org/rss/1.0/modules/company':               'co',
        'http://purl.org/rss/1.0/modules/content/':              'content',
        'http://my.theinfo.org/changed/1.0/rss/':                'cp',
        'http://purl.org/dc/elements/1.1/':                      'dc',
        'http://purl.org/dc/terms/':                             'dcterms',
        'http://purl.org/rss/1.0/modules/email/':                'email',
        'http://purl.org/rss/1.0/modules/event/':                'ev',
        'http://rssnamespace.org/feedburner/ext/1.0':            'feedburner',
        'http://freshmeat.net/rss/fm/':                          'fm',
        'http://xmlns.com/foaf/0.1/':                            'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#':              'geo',
        'http://www.georss.org/georss':                          'georss',
        'http://www.opengis.net/gml':                            'gml',
        'http://postneo.com/icbm/':                              'icbm',
        'http://purl.org/rss/1.0/modules/image/':                'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd':            'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd':               'itunes',
        'http://purl.org/rss/1.0/modules/link/':                 'l',
        'http://search.yahoo.com/mrss':                          'media',
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        'http://search.yahoo.com/mrss/':                         'media',
        'http://madskills.com/public/xml/rss/module/pingback/':  'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/':        'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#':           'rdf',
        'http://www.w3.org/2000/01/rdf-schema#':                 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/':            'ref',
        'http://purl.org/rss/1.0/modules/richequiv/':            'reqv',
        'http://purl.org/rss/1.0/modules/search/':               'search',
        'http://purl.org/rss/1.0/modules/slash/':                'slash',
        'http://schemas.xmlsoap.org/soap/envelope/':             'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/':        'ss',
        'http://hacks.benhammersley.com/rss/streaming/':         'str',
        'http://purl.org/rss/1.0/modules/subscription/':         'sub',
        'http://purl.org/rss/1.0/modules/syndication/':          'sy',
        'http://schemas.pocketsoap.com/rss/myDescModule/':       'szf',
        'http://purl.org/rss/1.0/modules/taxonomy/':             'taxo',
        'http://purl.org/rss/1.0/modules/threading/':            'thr',
        'http://purl.org/rss/1.0/modules/textinput/':            'ti',
        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
        'http://wellformedweb.org/commentAPI/':                  'wfw',
        'http://purl.org/rss/1.0/modules/wiki/':                 'wiki',
        'http://www.w3.org/1999/xhtml':                          'xhtml',
        'http://www.w3.org/1999/xlink':                          'xlink',
        'http://www.w3.org/XML/1998/namespace':                  'xml',
        'http://podlove.org/simple-chapters':                    'psc',
    }
    # Class-level cache: lowercased-URI version of `namespaces`, filled
    # lazily the first time an instance is created (see __init__).
    _matchnamespaces = {}

    # Elements whose character data may itself be a relative URI.
    can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
    # Elements whose (HTML) content may embed relative URIs.
    can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    # Elements whose content may carry markup that needs sanitizing.
    can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    # MIME types treated as HTML-ish rather than plain text.
    html_types = [u'text/html', u'application/xhtml+xml']
523
524    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
525        if not self._matchnamespaces:
526            for k, v in self.namespaces.items():
527                self._matchnamespaces[k.lower()] = v
528        self.feeddata = FeedParserDict() # feed-level data
529        self.encoding = encoding # character encoding
530        self.entries = [] # list of entry-level data
531        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
532        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
533
534        # the following are used internally to track state;
535        # this is really out of control and should be refactored
536        self.infeed = 0
537        self.inentry = 0
538        self.incontent = 0
539        self.intextinput = 0
540        self.inimage = 0
541        self.inauthor = 0
542        self.incontributor = 0
543        self.inpublisher = 0
544        self.insource = 0
545
546        # georss
547        self.ingeometry = 0
548
549        self.sourcedata = FeedParserDict()
550        self.contentparams = FeedParserDict()
551        self._summaryKey = None
552        self.namespacemap = {}
553        self.elementstack = []
554        self.basestack = []
555        self.langstack = []
556        self.baseuri = baseuri or u''
557        self.lang = baselang or None
558        self.svgOK = 0
559        self.title_depth = -1
560        self.depth = 0
561        # psc_chapters_flag prevents multiple psc_chapters from being
562        # captured in a single entry or item. The transition states are
563        # None -> True -> False. psc_chapter elements will only be
564        # captured while it is True.
565        self.psc_chapters_flag = None
566        if baselang:
567            self.feeddata['language'] = baselang.replace('_','-')
568
569        # A map of the following form:
570        #     {
571        #         object_that_value_is_set_on: {
572        #             property_name: depth_of_node_property_was_extracted_from,
573        #             other_property: depth_of_node_property_was_extracted_from,
574        #         },
575        #     }
576        self.property_depth_map = {}
577
578    def _normalize_attributes(self, kv):
579        k = kv[0].lower()
580        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
581        # the sgml parser doesn't handle entities in attributes, nor
582        # does it pass the attribute values through as unicode, while
583        # strict xml parsers do -- account for this difference
584        if isinstance(self, _LooseFeedParser):
585            v = v.replace('&amp;', '&')
586            if not isinstance(v, unicode):
587                v = v.decode('utf-8')
588        return (k, v)
589
    def unknown_starttag(self, tag, attrs):
        """Process a start tag.

        Updates depth, xml:base / xml:lang scope and namespace tracking,
        passes inline XHTML content through as text, then dispatches to a
        ``_start_<prefix><name>`` method when one is defined, falling back
        to storing the element generically.  *attrs* is a list of
        (name, value) pairs.
        """
        # increment depth counter
        self.depth += 1

        # normalize attrs
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            # only the root-level language becomes the feed's language
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces declared on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                # re-declare MathML/SVG namespaces so the captured inline
                # markup stays self-contained
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg':
                self.svgOK += 1
            # inline content is re-serialized as literal text, not parsed
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
677
    def unknown_endtag(self, tag):
        """Process an end tag.

        Dispatches to a ``_end_<prefix><name>`` method when one is defined
        (otherwise pops the element), closes out inline XHTML content, and
        restores the enclosing xml:base / xml:lang scope.
        """
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
            self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            # while inside inline SVG, bypass specialized handlers and fall
            # through to the generic pop below
            if self.svgOK:
                raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            # re-serialize the closing tag as literal text (strip any prefix)
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

        self.depth -= 1
721
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        """Append the character named by a numeric reference to the open
        element's text buffer; XML special characters are kept escaped."""
        if not self.elementstack:
            return
        ref = ref.lower()
        # references to the XML special characters (" & ' < >) are passed
        # through unexpanded so embedded markup stays well-formed
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            # 'x' prefix marks a hexadecimal reference
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
736
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        """Append the expansion of a named entity to the open element's
        text buffer; unknown entities are preserved verbatim."""
        if not self.elementstack:
            return
        # the XML special entities stay escaped (cf. handle_charref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                # NOTE(review): recursing with a '&#...;' string looks like it
                # was meant for handle_charref; the recursive call will hit the
                # KeyError branch below — confirm against upstream intent
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                # not a known HTML entity; keep the raw reference
                text = '&%s;' % ref
            else:
                text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
755
756    def handle_data(self, text, escape=1):
757        # called for each block of plain text, i.e. outside of any tag and
758        # not containing any character or entity references
759        if not self.elementstack:
760            return
761        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
762            text = _xmlescape(text)
763        self.elementstack[-1][2].append(text)
764
    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        # comments carry no feed data, so they are discarded
        pass
768
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # processing instructions carry no feed data, so they are discarded
        pass
772
    def handle_decl(self, text):
        # called for a <!...> declaration (e.g. DOCTYPE); nothing to record
        pass
775
776    def parse_declaration(self, i):
777        # override internal declaration handler to handle CDATA blocks
778        if self.rawdata[i:i+9] == '<![CDATA[':
779            k = self.rawdata.find(']]>', i)
780            if k == -1:
781                # CDATA block began but didn't finish
782                k = len(self.rawdata)
783                return k
784            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
785            return k+3
786        else:
787            k = self.rawdata.find('>', i)
788            if k >= 0:
789                return k+1
790            else:
791                # We have an incomplete CDATA block.
792                return k
793
794    def mapContentType(self, contentType):
795        contentType = contentType.lower()
796        if contentType == 'text' or contentType == 'plain':
797            contentType = u'text/plain'
798        elif contentType == 'html':
799            contentType = u'text/html'
800        elif contentType == 'xhtml':
801            contentType = u'application/xhtml+xml'
802        return contentType
803
804    def trackNamespace(self, prefix, uri):
805        loweruri = uri.lower()
806        if not self.version:
807            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
808                self.version = u'rss090'
809            elif loweruri == 'http://purl.org/rss/1.0/':
810                self.version = u'rss10'
811            elif loweruri == 'http://www.w3.org/2005/atom':
812                self.version = u'atom10'
813        if loweruri.find(u'backend.userland.com/rss') <> -1:
814            # match any backend.userland.com namespace
815            uri = u'http://backend.userland.com/rss'
816            loweruri = uri
817        if loweruri in self._matchnamespaces:
818            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
819            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
820        else:
821            self.namespacesInUse[prefix or ''] = uri
822
823    def resolveURI(self, uri):
824        return _urljoin(self.baseuri or u'', uri)
825
826    def decodeEntities(self, element, data):
827        return data
828
829    def strattrs(self, attrs):
830        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
831
    def push(self, element, expectingText):
        """Open element: start accumulating its character-data pieces."""
        self.elementstack.append([element, expectingText, []])
834
    def pop(self, element, stripWhitespace=1):
        """Close element: join its accumulated text pieces, run the
        post-processing pipeline (base64 decode, relative-URI
        resolution, entity decoding, HTML sniffing, sanitizing,
        encoding fixups), store the result in the feed/entry dict,
        and return the processed text.

        The steps below are order-sensitive; do not reorder them.
        """
        if not self.elementstack:
            return
        # ignore a close for anything but the innermost open element
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    # the loop ran to completion: the outer <div> wraps
                    # everything, so strip it
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            # do not resolve guid elements with isPermalink="false"
            if not element == 'id' or self.guidislink:
                output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = output.translate(_cp1252)

        # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords
        if element in ('category', 'tags', 'itunes_keywords'):
            return output

        if element == 'title' and -1 < self.title_depth <= self.depth:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = output.replace('&amp;', '&')
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                # only overwrite a previous value if this element is at the
                # same depth or shallower (prefer outer elements)
                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
                if old_value_depth is None or self.depth <= old_value_depth:
                    self.property_depth_map[self.entries[-1]][element] = self.depth
                    self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
990
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        """Open a content-bearing element: populate self.contentparams
        (type, language, base, base64 flag) and start buffering text."""
        self.incontent += 1
        if self.lang:
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        # note: _isBase64 inspects self.contentparams['type'], so it must
        # run after the assignment above
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
1001
    def popContent(self, tag):
        """Close an element opened by pushContent() and return its text,
        resetting the content-tracking state."""
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
1007
1008    # a number of elements in a number of RSS variants are nominally plain
1009    # text, but this is routinely ignored.  This is an attempt to detect
1010    # the most common cases.  As false positives often result in silent
1011    # data loss, this function errs on the conservative side.
1012    @staticmethod
1013    def lookslikehtml(s):
1014        # must have a close tag or an entity reference to qualify
1015        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
1016            return
1017
1018        # all tags must be in a restricted subset of valid HTML tags
1019        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
1020            re.findall(r'</?(\w+)',s)):
1021            return
1022
1023        # all entities must have been defined as valid HTML entities
1024        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
1025            return
1026
1027        return 1
1028
1029    def _mapToStandardPrefix(self, name):
1030        colonpos = name.find(':')
1031        if colonpos <> -1:
1032            prefix = name[:colonpos]
1033            suffix = name[colonpos+1:]
1034            prefix = self.namespacemap.get(prefix, prefix)
1035            name = prefix + ':' + suffix
1036        return name
1037
    def _getAttribute(self, attrsD, name):
        """Look up name in attrsD after normalizing its namespace prefix."""
        return attrsD.get(self._mapToStandardPrefix(name))
1040
1041    def _isBase64(self, attrsD, contentparams):
1042        if attrsD.get('mode', '') == 'base64':
1043            return 1
1044        if self.contentparams['type'].startswith(u'text/'):
1045            return 0
1046        if self.contentparams['type'].endswith(u'+xml'):
1047            return 0
1048        if self.contentparams['type'].endswith(u'/xml'):
1049            return 0
1050        return 1
1051
1052    def _itsAnHrefDamnIt(self, attrsD):
1053        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1054        if href:
1055            try:
1056                del attrsD['url']
1057            except KeyError:
1058                pass
1059            try:
1060                del attrsD['uri']
1061            except KeyError:
1062                pass
1063            attrsD['href'] = href
1064        return attrsD
1065
1066    def _save(self, key, value, overwrite=False):
1067        context = self._getContext()
1068        if overwrite:
1069            context[key] = value
1070        else:
1071            context.setdefault(key, value)
1072
1073    def _start_rss(self, attrsD):
1074        versionmap = {'0.91': u'rss091u',
1075                      '0.92': u'rss092',
1076                      '0.93': u'rss093',
1077                      '0.94': u'rss094'}
1078        #If we're here then this is an RSS feed.
1079        #If we don't have a version or have a version that starts with something
1080        #other than RSS then there's been a mistake. Correct it.
1081        if not self.version or not self.version.startswith(u'rss'):
1082            attr_version = attrsD.get('version', '')
1083            version = versionmap.get(attr_version)
1084            if version:
1085                self.version = version
1086            elif attr_version.startswith('2.'):
1087                self.version = u'rss20'
1088            else:
1089                self.version = u'rss'
1090
    def _start_channel(self, attrsD):
        """Enter an RSS/CDF <channel>; CDF attributes are mapped to
        regular elements by _cdf_common."""
        self.infeed = 1
        self._cdf_common(attrsD)
1094
    def _cdf_common(self, attrsD):
        """Map CDF 'lastmod' and 'href' attributes onto the modified and
        link handlers by faking the matching start/data/end events."""
        if 'lastmod' in attrsD:
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if 'href' in attrsD:
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()
1104
1105    def _start_feed(self, attrsD):
1106        self.infeed = 1
1107        versionmap = {'0.1': u'atom01',
1108                      '0.2': u'atom02',
1109                      '0.3': u'atom03'}
1110        if not self.version:
1111            attr_version = attrsD.get('version')
1112            version = versionmap.get(attr_version)
1113            if version:
1114                self.version = version
1115            else:
1116                self.version = u'atom'
1117
    def _end_channel(self):
        """Leave the feed-level scope (shared by <channel> and <feed>)."""
        self.infeed = 0
    _end_feed = _end_channel
1121
    def _start_image(self, attrsD):
        """Enter an <image> element; feed-level images get their own dict."""
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        # reset title tracking so the image's <title> is captured
        self.title_depth = -1
        self.push('image', 0)
1129
    def _end_image(self):
        """Leave an <image> element."""
        self.pop('image')
        self.inimage = 0
1133
    def _start_textinput(self, attrsD):
        """Enter a <textinput>/<textInput> element."""
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.title_depth = -1
        self.push('textinput', 0)
    _start_textInput = _start_textinput
1141
    def _end_textinput(self):
        """Leave a <textinput>/<textInput> element."""
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
1146
    def _start_author(self, attrsD):
        """Enter an author-ish element (author/managingEditor/dc:creator/...)."""
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author
1158
    def _end_author(self):
        """Leave an author-ish element and reconcile author/author_detail."""
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author
1167
    def _start_itunes_owner(self, attrsD):
        """Enter an <itunes:owner> element (stored as 'publisher')."""
        self.inpublisher = 1
        self.push('publisher', 0)
1171
    def _end_itunes_owner(self):
        """Leave an <itunes:owner> element and sync publisher details."""
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
1176
    def _start_contributor(self, attrsD):
        """Enter a <contributor> element; each one gets its own dict."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)
1183
    def _end_contributor(self):
        """Leave a <contributor> element."""
        self.pop('contributor')
        self.incontributor = 0
1187
    def _start_dc_contributor(self, attrsD):
        """Enter a <dc:contributor>; its text is the contributor's name."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)
1194
    def _end_dc_contributor(self):
        """Leave a <dc:contributor>, routing its text through _end_name."""
        self._end_name()
        self.incontributor = 0
1198
    def _start_name(self, attrsD):
        """Enter a <name> (or <itunes:name>) element."""
        self.push('name', 0)
    _start_itunes_name = _start_name
1202
    def _end_name(self):
        """Route a <name> value to whichever person/textinput scope is open."""
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
1215
    def _start_width(self, attrsD):
        """Enter a <width> element (image dimension)."""
        self.push('width', 0)
1218
1219    def _end_width(self):
1220        value = self.pop('width')
1221        try:
1222            value = int(value)
1223        except ValueError:
1224            value = 0
1225        if self.inimage:
1226            context = self._getContext()
1227            context['width'] = value
1228
    def _start_height(self, attrsD):
        """Enter a <height> element (image dimension)."""
        self.push('height', 0)
1231
1232    def _end_height(self):
1233        value = self.pop('height')
1234        try:
1235            value = int(value)
1236        except ValueError:
1237            value = 0
1238        if self.inimage:
1239            context = self._getContext()
1240            context['height'] = value
1241
    def _start_url(self, attrsD):
        """Enter a <url>/<homePage>/<uri> element; stored as 'href'."""
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url
1246
    def _end_url(self):
        """Attach an href value to the open author or contributor, if any."""
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url
1255
    def _start_email(self, attrsD):
        """Enter an <email> (or <itunes:email>) element."""
        self.push('email', 0)
    _start_itunes_email = _start_email
1259
    def _end_email(self):
        """Route an email value to the open publisher/author/contributor."""
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
1269
1270    def _getContext(self):
1271        if self.insource:
1272            context = self.sourcedata
1273        elif self.inimage and 'image' in self.feeddata:
1274            context = self.feeddata['image']
1275        elif self.intextinput:
1276            context = self.feeddata['textinput']
1277        elif self.inentry:
1278            context = self.entries[-1]
1279        else:
1280            context = self.feeddata
1281        return context
1282
    def _save_author(self, key, value, prefix='author'):
        """Store a person field both in '<prefix>_detail' and in the
        last entry of the plural '<prefix>s' list, then resync."""
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value
1290
    def _save_contributor(self, key, value):
        """Store a field on the most recently opened contributor dict."""
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
1295
1296    def _sync_author_detail(self, key='author'):
1297        context = self._getContext()
1298        detail = context.get('%ss' % key, [FeedParserDict()])[-1]
1299        if detail:
1300            name = detail.get('name')
1301            email = detail.get('email')
1302            if name and email:
1303                context[key] = u'%s (%s)' % (name, email)
1304            elif name:
1305                context[key] = name
1306            elif email:
1307                context[key] = email
1308        else:
1309            author, email = context.get(key), None
1310            if not author:
1311                return
1312            emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1313            if emailmatch:
1314                email = emailmatch.group(0)
1315                # probably a better way to do the following, but it passes all the tests
1316                author = author.replace(email, u'')
1317                author = author.replace(u'()', u'')
1318                author = author.replace(u'<>', u'')
1319                author = author.replace(u'&lt;&gt;', u'')
1320                author = author.strip()
1321                if author and (author[0] == u'('):
1322                    author = author[1:]
1323                if author and (author[-1] == u')'):
1324                    author = author[:-1]
1325                author = author.strip()
1326            if author or email:
1327                context.setdefault('%s_detail' % key, detail)
1328            if author:
1329                detail['name'] = author
1330            if email:
1331                detail['email'] = email
1332
    def _start_subtitle(self, attrsD):
        """Enter a subtitle-ish element (subtitle/tagline/itunes:subtitle)."""
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle
1337
    def _end_subtitle(self):
        """Leave a subtitle-ish element."""
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle
1342
    def _start_rights(self, attrsD):
        """Enter a rights-ish element (rights/dc:rights/copyright)."""
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights
1347
    def _end_rights(self):
        """Leave a rights-ish element."""
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
1352
1353    def _start_item(self, attrsD):
1354        self.entries.append(FeedParserDict())
1355        self.push('item', 0)
1356        self.inentry = 1
1357        self.guidislink = 0
1358        self.title_depth = -1
1359        self.psc_chapters_flag = None
1360        id = self._getAttribute(attrsD, 'rdf:about')
1361        if id:
1362            context = self._getContext()
1363            context['id'] = id
1364        self._cdf_common(attrsD)
1365    _start_entry = _start_item
1366
    def _end_item(self):
        """Leave an entry (<item>/<entry>)."""
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
1371
    def _start_dc_language(self, attrsD):
        """Enter a <dc:language>/<language> element."""
        self.push('language', 1)
    _start_language = _start_dc_language
1375
    def _end_dc_language(self):
        """Record the feed language as the current xml:lang fallback."""
        self.lang = self.pop('language')
    _end_language = _end_dc_language
1379
    def _start_dc_publisher(self, attrsD):
        """Enter a <dc:publisher>/<webMaster> element."""
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher
1383
    def _end_dc_publisher(self):
        """Leave a publisher element and sync publisher details."""
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
1388
    def _start_dcterms_valid(self, attrsD):
        """Enter a <dcterms:valid> element."""
        self.push('validity', 1)
1391
1392    def _end_dcterms_valid(self):
1393        for validity_detail in self.pop('validity').split(';'):
1394            if '=' in validity_detail:
1395                key, value = validity_detail.split('=', 1)
1396                if key == 'start':
1397                    self._save('validity_start', value, overwrite=True)
1398                    self._save('validity_start_parsed', _parse_date(value), overwrite=True)
1399                elif key == 'end':
1400                    self._save('validity_end', value, overwrite=True)
1401                    self._save('validity_end_parsed', _parse_date(value), overwrite=True)
1402
    def _start_published(self, attrsD):
        """Enter a publication-date element (published/issued/pubDate)."""
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published
    _start_pubdate = _start_published
1408
    def _end_published(self):
        """Store the raw publication date plus its parsed form."""
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published
    _end_pubdate = _end_published
1415
    def _start_updated(self, attrsD):
        """Enter an update-date element (updated/modified/dc:date/...)."""
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated
1422
    def _end_updated(self):
        """Store the raw update date plus its parsed form."""
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated
1431
    def _start_created(self, attrsD):
        """Enter a creation-date element (created/dcterms:created)."""
        self.push('created', 1)
    _start_dcterms_created = _start_created
1435
    def _end_created(self):
        """Store the raw creation date plus its parsed form."""
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created
1440
    def _start_expirationdate(self, attrsD):
        """Enter an <expirationDate> element."""
        self.push('expired', 1)
1443
    def _end_expirationdate(self):
        """Store the parsed expiration date."""
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1446
1447    # geospatial location, or "where", from georss.org
1448
    def _start_georssgeom(self, attrsD):
        """Begin a georss geometry element; prepare an empty 'where' dict."""
        self.push('geometry', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()

    _start_georss_point = _start_georssgeom
    _start_georss_line = _start_georssgeom
    _start_georss_polygon = _start_georssgeom
    _start_georss_box = _start_georssgeom
1458
    def _save_where(self, geometry):
        """Merge a parsed geometry dict into the current 'where' value."""
        context = self._getContext()
        context['where'].update(geometry)
1462
    def _end_georss_point(self):
        """Parse a <georss:point> coordinate string into geometry."""
        geometry = _parse_georss_point(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1467
    def _end_georss_line(self):
        """Parse a <georss:line> coordinate string into geometry."""
        geometry = _parse_georss_line(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1472
    def _end_georss_polygon(self):
        """Parse a <georss:polygon> coordinate string into geometry."""
        this = self.pop('geometry')
        geometry = _parse_georss_polygon(this)
        if geometry:
            self._save_where(geometry)
1478
    def _end_georss_box(self):
        """Parse a <georss:box> coordinate string into geometry."""
        geometry = _parse_georss_box(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1483
    def _start_where(self, attrsD):
        """Begin a <where>/<georss:where> container; reset 'where'."""
        self.push('where', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()
    _start_georss_where = _start_where
1489
    def _parse_srs_attrs(self, attrsD):
        """Record GML spatial-reference attributes (srsName and
        srsDimension, defaulting to 2) on the current 'where' dict."""
        srsName = attrsD.get('srsname')
        try:
            srsDimension = int(attrsD.get('srsdimension', '2'))
        except ValueError:
            srsDimension = 2
        context = self._getContext()
        context['where']['srsName'] = srsName
        context['where']['srsDimension'] = srsDimension
1499
    def _start_gml_point(self, attrsD):
        # gml:Point: record SRS info, flag that we're inside a geometry,
        # and open a text buffer for the coordinates.
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 1
        self.push('geometry', 0)
1504
    def _start_gml_linestring(self, attrsD):
        # gml:LineString: record SRS info and tag the geometry kind for
        # _parse_poslist later.
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 'linestring'
        self.push('geometry', 0)
1509
    def _start_gml_polygon(self, attrsD):
        # gml:Polygon: record SRS info; the ring itself sets ingeometry.
        self._parse_srs_attrs(attrsD)
        self.push('geometry', 0)
1513
    def _start_gml_exterior(self, attrsD):
        # gml:exterior: just open a geometry buffer for the enclosed ring.
        self.push('geometry', 0)
1516
    def _start_gml_linearring(self, attrsD):
        # gml:LinearRing: coordinates form a polygon ring.
        self.ingeometry = 'polygon'
        self.push('geometry', 0)
1520
    def _start_gml_pos(self, attrsD):
        # gml:pos: buffer the single coordinate pair's text.
        self.push('pos', 0)
1523
1524    def _end_gml_pos(self):
1525        this = self.pop('pos')
1526        context = self._getContext()
1527        srsName = context['where'].get('srsName')
1528        srsDimension = context['where'].get('srsDimension', 2)
1529        swap = True
1530        if srsName and "EPSG" in srsName:
1531            epsg = int(srsName.split(":")[-1])
1532            swap = bool(epsg in _geogCS)
1533        geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
1534        if geometry:
1535            self._save_where(geometry)
1536
    def _start_gml_poslist(self, attrsD):
        # gml:posList: buffer the whitespace-separated coordinate list.
        self.push('pos', 0)
1539
1540    def _end_gml_poslist(self):
1541        this = self.pop('pos')
1542        context = self._getContext()
1543        srsName = context['where'].get('srsName')
1544        srsDimension = context['where'].get('srsDimension', 2)
1545        swap = True
1546        if srsName and "EPSG" in srsName:
1547            epsg = int(srsName.split(":")[-1])
1548            swap = bool(epsg in _geogCS)
1549        geometry = _parse_poslist(
1550            this, self.ingeometry, swap=swap, dims=srsDimension)
1551        if geometry:
1552            self._save_where(geometry)
1553
    def _end_geom(self):
        # Shared close handler for GML geometry elements: clear the
        # in-geometry flag and discard the text buffer.
        self.ingeometry = 0
        self.pop('geometry')
    _end_gml_point = _end_geom
    _end_gml_linestring = _end_geom
    _end_gml_linearring = _end_geom
    _end_gml_exterior = _end_geom
    _end_gml_polygon = _end_geom
1562
    def _end_where(self):
        # Close a (georss:)where element; the 'where' dict stays in context.
        self.pop('where')
    _end_georss_where = _end_where
1566
1567    # end geospatial
1568
1569    def _start_cc_license(self, attrsD):
1570        context = self._getContext()
1571        value = self._getAttribute(attrsD, 'rdf:resource')
1572        attrsD = FeedParserDict()
1573        attrsD['rel'] = u'license'
1574        if value:
1575            attrsD['href']=value
1576        context.setdefault('links', []).append(attrsD)
1577
    def _start_creativecommons_license(self, attrsD):
        # creativeCommons:license carries its URL as element text; buffer it.
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license
1581
1582    def _end_creativecommons_license(self):
1583        value = self.pop('license')
1584        context = self._getContext()
1585        attrsD = FeedParserDict()
1586        attrsD['rel'] = u'license'
1587        if value:
1588            attrsD['href'] = value
1589        context.setdefault('links', []).append(attrsD)
1590        del context['license']
1591    _end_creativeCommons_license = _end_creativecommons_license
1592
1593    def _addTag(self, term, scheme, label):
1594        context = self._getContext()
1595        tags = context.setdefault('tags', [])
1596        if (not term) and (not scheme) and (not label):
1597            return
1598        value = FeedParserDict(term=term, scheme=scheme, label=label)
1599        if value not in tags:
1600            tags.append(value)
1601
    def _start_tags(self, attrsD):
        # This is a completely-made up element. Its semantics are determined
        # only by a single feed that precipitated bug report 392 on Google Code.
        # In short, this is junk code.
        self.push('tags', 1)
1607
1608    def _end_tags(self):
1609        for term in self.pop('tags').split(','):
1610            self._addTag(term.strip(), None, None)
1611
1612    def _start_category(self, attrsD):
1613        term = attrsD.get('term')
1614        scheme = attrsD.get('scheme', attrsD.get('domain'))
1615        label = attrsD.get('label')
1616        self._addTag(term, scheme, label)
1617        self.push('category', 1)
1618    _start_dc_subject = _start_category
1619    _start_keywords = _start_category
1620
    def _start_media_category(self, attrsD):
        # media:category defaults to Yahoo's Media RSS category scheme.
        attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)
1624
1625    def _end_itunes_keywords(self):
1626        for term in self.pop('itunes_keywords').split(','):
1627            if term.strip():
1628                self._addTag(term.strip(), u'http://www.itunes.com/', None)
1629
1630    def _end_media_keywords(self):
1631        for term in self.pop('media_keywords').split(','):
1632            if term.strip():
1633                self._addTag(term.strip(), None, None)
1634
    def _start_itunes_category(self, attrsD):
        # itunes:category keeps its value in the 'text' attribute.
        self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
        self.push('category', 1)
1638
1639    def _end_category(self):
1640        value = self.pop('category')
1641        if not value:
1642            return
1643        context = self._getContext()
1644        tags = context['tags']
1645        if value and len(tags) and not tags[-1]['term']:
1646            tags[-1]['term'] = value
1647        else:
1648            self._addTag(value, None, None)
1649    _end_dc_subject = _end_category
1650    _end_keywords = _end_category
1651    _end_itunes_category = _end_category
1652    _end_media_category = _end_category
1653
    def _start_cloud(self, attrsD):
        # RSS <cloud>: store its attributes verbatim on the context.
        self._getContext()['cloud'] = FeedParserDict(attrsD)
1656
    def _start_link(self, attrsD):
        # Default rel/type, resolve the href, register the link on the
        # current context, and buffer element text only when no href exists.
        attrsD.setdefault('rel', u'alternate')
        if attrsD['rel'] == u'self':
            attrsD.setdefault('type', u'application/atom+xml')
        else:
            attrsD.setdefault('type', u'text/html')
        context = self._getContext()
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if 'href' in attrsD:
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context.setdefault('links', [])
        if not (self.inentry and self.inimage):
            context['links'].append(FeedParserDict(attrsD))
        if 'href' in attrsD:
            expectingText = 0
            # an alternate link with an HTML-ish type becomes the main 'link'
            if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
1677
1678    def _end_link(self):
1679        value = self.pop('link')
1680
    def _start_guid(self, attrsD):
        # guid doubles as a link unless isPermaLink="false" is given.
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)
    _start_id = _start_guid
1685
1686    def _end_guid(self):
1687        value = self.pop('id')
1688        self._save('guidislink', self.guidislink and 'link' not in self._getContext())
1689        if self.guidislink:
1690            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1691            # and only if the item doesn't already have a link element
1692            self._save('link', value)
1693    _end_id = _end_guid
1694
1695    def _start_title(self, attrsD):
1696        if self.svgOK:
1697            return self.unknown_starttag('title', attrsD.items())
1698        self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1699    _start_dc_title = _start_title
1700    _start_media_title = _start_title
1701
1702    def _end_title(self):
1703        if self.svgOK:
1704            return
1705        value = self.popContent('title')
1706        if not value:
1707            return
1708        self.title_depth = self.depth
1709    _end_dc_title = _end_title
1710
    def _end_media_title(self):
        # Handled like a title, but media:title must not clobber title_depth.
        title_depth = self.title_depth
        self._end_title()
        self.title_depth = title_depth
1715
1716    def _start_description(self, attrsD):
1717        context = self._getContext()
1718        if 'summary' in context:
1719            self._summaryKey = 'content'
1720            self._start_content(attrsD)
1721        else:
1722            self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
1723    _start_dc_description = _start_description
1724    _start_media_description = _start_description
1725
    def _start_abstract(self, attrsD):
        # <abstract> is treated as a plain-text description.
        self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1728
1729    def _end_description(self):
1730        if self._summaryKey == 'content':
1731            self._end_content()
1732        else:
1733            value = self.popContent('description')
1734        self._summaryKey = None
1735    _end_abstract = _end_description
1736    _end_dc_description = _end_description
1737    _end_media_description = _end_description
1738
    def _start_info(self, attrsD):
        # Atom 0.3 <info>: buffer its text as plain-text content.
        self.pushContent('info', attrsD, u'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info
1742
    def _end_info(self):
        # Close an <info> element; popContent stores the text itself.
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
1746
    def _start_generator(self, attrsD):
        # Record the generator's attributes (with any href resolved) and
        # buffer the element text for the generator's name.
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if 'href' in attrsD:
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)
1754
1755    def _end_generator(self):
1756        value = self.pop('generator')
1757        context = self._getContext()
1758        if 'generator_detail' in context:
1759            context['generator_detail']['name'] = value
1760
    def _start_admin_generatoragent(self, attrsD):
        # admin:generatorAgent keeps its value in rdf:resource; funnel it
        # through push/pop so 'generator' is recorded like element text.
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1768
    def _start_admin_errorreportsto(self, attrsD):
        # admin:errorReportsTo keeps its value in rdf:resource; funnel it
        # through push/pop so it is recorded like element text.
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
1775
1776    def _start_summary(self, attrsD):
1777        context = self._getContext()
1778        if 'summary' in context:
1779            self._summaryKey = 'content'
1780            self._start_content(attrsD)
1781        else:
1782            self._summaryKey = 'summary'
1783            self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
1784    _start_itunes_summary = _start_summary
1785
    def _end_summary(self):
        # Close a summary (or the content it was redirected to) and reset
        # the redirection marker.
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
1793
1794    def _start_enclosure(self, attrsD):
1795        attrsD = self._itsAnHrefDamnIt(attrsD)
1796        context = self._getContext()
1797        attrsD['rel'] = u'enclosure'
1798        context.setdefault('links', []).append(FeedParserDict(attrsD))
1799
    def _start_source(self, attrsD):
        if 'url' in attrsD:
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        # reset so a <title> inside <source> is tracked independently
        self.title_depth = -1
1807
1808    def _end_source(self):
1809        self.insource = 0
1810        value = self.pop('source')
1811        if value:
1812            self.sourcedata['title'] = value
1813        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1814        self.sourcedata.clear()
1815
    def _start_content(self, attrsD):
        # Atom <content>: buffer as plain text by default; remember any
        # src attribute (out-of-line content).
        self.pushContent('content', attrsD, u'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)
1822
    def _start_body(self, attrsD):
        # <body> (CDF) is treated as inline XHTML content.
        self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body
1826
    def _start_content_encoded(self, attrsD):
        # content:encoded (RSS) is escaped HTML content.
        self.pushContent('content', attrsD, u'text/html', 1)
    _start_fullitem = _start_content_encoded
1830
1831    def _end_content(self):
1832        copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
1833        value = self.popContent('content')
1834        if copyToSummary:
1835            self._save('summary', value)
1836
1837    _end_body = _end_content
1838    _end_xhtml_body = _end_content
1839    _end_content_encoded = _end_content
1840    _end_fullitem = _end_content
1841
1842    def _start_itunes_image(self, attrsD):
1843        self.push('itunes_image', 0)
1844        if attrsD.get('href'):
1845            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1846        elif attrsD.get('url'):
1847            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
1848    _start_itunes_link = _start_itunes_image
1849
1850    def _end_itunes_block(self):
1851        value = self.pop('itunes_block', 0)
1852        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1853
1854    def _end_itunes_explicit(self):
1855        value = self.pop('itunes_explicit', 0)
1856        # Convert 'yes' -> True, 'clean' to False, and any other value to None
1857        # False and None both evaluate as False, so the difference can be ignored
1858        # by applications that only need to know if the content is explicit.
1859        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1860
    def _start_media_group(self, attrsD):
        # don't do anything, but don't break the enclosed tags either
        pass
1864
    def _start_media_rating(self, attrsD):
        # media:rating: keep the first rating's attributes, buffer the text.
        context = self._getContext()
        context.setdefault('media_rating', attrsD)
        self.push('rating', 1)
1869
1870    def _end_media_rating(self):
1871        rating = self.pop('rating')
1872        if rating is not None and rating.strip():
1873            context = self._getContext()
1874            context['media_rating']['content'] = rating
1875
    def _start_media_credit(self, attrsD):
        # media:credit: append a new credit entry, buffer its text.
        context = self._getContext()
        context.setdefault('media_credit', [])
        context['media_credit'].append(attrsD)
        self.push('credit', 1)
1881
1882    def _end_media_credit(self):
1883        credit = self.pop('credit')
1884        if credit != None and len(credit.strip()) != 0:
1885            context = self._getContext()
1886            context['media_credit'][-1]['content'] = credit
1887
    def _start_media_restriction(self, attrsD):
        # media:restriction: keep the first restriction's attributes,
        # buffer the text.
        context = self._getContext()
        context.setdefault('media_restriction', attrsD)
        self.push('restriction', 1)
1892
1893    def _end_media_restriction(self):
1894        restriction = self.pop('restriction')
1895        if restriction != None and len(restriction.strip()) != 0:
1896            context = self._getContext()
1897            context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]
1898
    def _start_media_license(self, attrsD):
        # media:license: keep the first license's attributes, buffer the text.
        context = self._getContext()
        context.setdefault('media_license', attrsD)
        self.push('license', 1)
1903
1904    def _end_media_license(self):
1905        license = self.pop('license')
1906        if license != None and len(license.strip()) != 0:
1907            context = self._getContext()
1908            context['media_license']['content'] = license
1909
    def _start_media_content(self, attrsD):
        # media:content: append this element's attributes to the list.
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)
1914
    def _start_media_thumbnail(self, attrsD):
        # media:thumbnail: append the attributes and also buffer element
        # text in case the url comes as text instead of an attribute.
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)
1920
1921    def _end_media_thumbnail(self):
1922        url = self.pop('url')
1923        context = self._getContext()
1924        if url != None and len(url.strip()) != 0:
1925            if 'url' not in context['media_thumbnail'][-1]:
1926                context['media_thumbnail'][-1]['url'] = url
1927
    def _start_media_player(self, attrsD):
        # media:player: store the attributes, buffer the element text.
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)
1931
1932    def _end_media_player(self):
1933        value = self.pop('media_player')
1934        context = self._getContext()
1935        context['media_player']['content'] = value
1936
    def _start_newlocation(self, attrsD):
        # RSS <newLocation>: buffer the replacement feed URL.
        self.push('newlocation', 1)
1939
    def _end_newlocation(self):
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1947
    def _start_psc_chapters(self, attrsD):
        # Only the first psc:chapters element is honored; the flag moves
        # None -> True -> False over the element's lifecycle.
        if self.psc_chapters_flag is None:
            # Transition from None -> True
            self.psc_chapters_flag = True
            attrsD['chapters'] = []
            self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
1954
    def _end_psc_chapters(self):
        # Transition from True -> False
        self.psc_chapters_flag = False
1958
    def _start_psc_chapter(self, attrsD):
        # Record chapters only while inside the first psc:chapters element.
        if self.psc_chapters_flag:
            start = self._getAttribute(attrsD, 'start')
            attrsD['start_parsed'] = _parse_psc_chapter_start(start)

            context = self._getContext()['psc_chapters']
            context['chapters'].append(FeedParserDict(attrsD))
1966
1967
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX ContentHandler that drives _FeedParserMixin for well-formed XML.

        Records parse errors in `self.bozo`/`self.exc` instead of aborting
        (except for fatal errors, which are re-raised).
        """

        def __init__(self, baseuri, baselang, encoding):
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None
            self.decls = {}

        def startPrefixMapping(self, prefix, uri):
            if not uri:
                return
            # Jython uses '' instead of None; standardize on None
            prefix = prefix or None
            self.trackNamespace(prefix, uri)
            if prefix and uri == 'http://www.w3.org/1999/xlink':
                self.decls['xmlns:' + prefix] = uri

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            # `!=` replaces the Python-2-only `<>` operator (same semantics)
            if lowernamespace.find(u'backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = u'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
                # call-style raise works on both Python 2 and Python 3
                raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
            localname = str(localname).lower()

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all).  Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD, self.decls = self.decls, {}
            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                attrsD['xmlns']=namespace
            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
                attrsD['xmlns']=namespace

            if prefix:
                localname = prefix.lower() + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break

            for (namespace, attrlocalname), attrvalue in attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            localname = str(localname).lower()
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            # record the parse error and mark the feed as bozo
            self.bozo = 1
            self.exc = exc

        # drv_libxml2 calls warning() in some cases
        warning = error

        def fatalError(self, exc):
            self.error(exc)
            raise exc
2065
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML parser that reconstructs the markup it parses.

    Subclasses override the handle_*/unknown_* callbacks to filter or
    rewrite markup; output() returns the reassembled document.
    """

    # characters needing special handling inside attribute values
    special = re.compile('''[<>'"]''')
    # an '&' that is not already part of a character/entity reference
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    # HTML void elements: serialized as <tag /> with no closing tag
    elements_no_end_tag = set([
      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
      'source', 'track', 'wbr'
    ])

    def __init__(self, encoding, _type):
        # encoding: character encoding of the input; _type: its MIME type
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # accumulated output fragments, joined together by output()
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        # regex callback: expand <tag/> to <tag></tag>, except void elements
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        pass
    goahead.func_code = sgmllib.SGMLParser.goahead.func_code

    def __parse_starttag(self, i):
        pass
    __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code

    def parse_starttag(self,i):
        # run sgmllib's start-tag parser, then emit the matching end tag
        # ourselves for XHTML self-closing tags
        j = self.__parse_starttag(i)
        if self._type == 'application/xhtml+xml':
            if j>2 and self.rawdata[j-2:j]=='/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        # pre-process the markup so sgmllib can cope with it, then parse
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        try:
            bytes
            if bytes is str:
                raise NameError
            # on Python 3, poison the encoding so no further encode happens
            self.encoding = self.encoding + u'_INVALID_PYTHON_3'
        except NameError:
            if self.encoding and isinstance(data, unicode):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        if not attrs:
            return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        uattrs = []
        strattrs=''
        if attrs:
            for key, value in attrs:
                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if not isinstance(value, unicode):
                    value = value.decode(self.encoding, 'ignore')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except (UnicodeEncodeError, LookupError):
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%s%s />' % (tag, strattrs))
        else:
            self.pieces.append('<%s%s>' % (tag, strattrs))

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        ref = ref.lower()
        if ref.startswith('x'):
            value = int(ref[1:], 16)
        else:
            value = int(ref)

        if value in _cp1252:
            # re-map cp1252-only code points to their Unicode equivalents
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%s;' % ref)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            # unknown entity: escape the ampersand so output stays well-formed
            self.pieces.append('&amp;%s' % ref)

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%s-->' % text)

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%s>' % text)

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%s>' % text)

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # more lenient replacement for sgmllib's declaration-name scanner
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        # keep character references unexpanded in the output
        return '&#%s;' % name

    def convert_entityref(self, name):
        # keep entity references unexpanded in the output
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

    def parse_declaration(self, i):
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('&lt;')
            return i+1
2251
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """sgmllib-based (loose) parser driver, used when strict XML parsing fails."""

    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        # Normalize numeric references for the XML special characters to
        # their named-entity equivalents.
        for numeric, named in (
                ('&#60;', '&lt;'), ('&#x3c;', '&lt;'), ('&#x3C;', '&lt;'),
                ('&#62;', '&gt;'), ('&#x3e;', '&gt;'), ('&#x3E;', '&gt;'),
                ('&#38;', '&amp;'), ('&#x26;', '&amp;'),
                ('&#34;', '&quot;'), ('&#x22;', '&quot;'),
                ('&#39;', '&apos;'), ('&#x27;', '&apos;')):
            data = data.replace(numeric, named)
        if not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # non-XML content: fully decode the special characters
            for named, char in (
                    ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&'),
                    ('&quot;', '"'), ('&apos;', "'"),
                    ('&#x2f;', '/'), ('&#x2F;', '/')):
                data = data.replace(named, char)
        return data

    def strattrs(self, attrs):
        # serialize (name, value) pairs back into attribute syntax
        pieces = [' %s="%s"' % (name, value.replace('"', '&quot;'))
                  for name, value in attrs]
        return ''.join(pieces)
2284
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML filter that rewrites known URI-bearing attributes so that
    relative references are resolved against a base URI."""

    # (element, attribute) pairs whose values may hold a relative URI.
    relative_uris = set([
        ('a', 'href'), ('applet', 'codebase'), ('area', 'href'),
        ('audio', 'src'), ('blockquote', 'cite'), ('body', 'background'),
        ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'),
        ('frame', 'src'), ('iframe', 'longdesc'), ('iframe', 'src'),
        ('head', 'profile'), ('img', 'longdesc'), ('img', 'src'),
        ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'),
        ('ins', 'cite'), ('link', 'href'), ('object', 'classid'),
        ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'),
        ('q', 'cite'), ('script', 'src'), ('source', 'src'),
        ('video', 'poster'), ('video', 'src')])

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Strip surrounding whitespace before joining against the base.
        return _makeSafeAbsoluteURI(self.baseuri, uri.strip())

    def unknown_starttag(self, tag, attrs):
        rewritten = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # NOTE: mirrors the original `and/or` expression -- when
                # resolution yields an empty (unsafe) URI, the raw value
                # is kept as-is.
                value = self.resolveURI(value) or value
            rewritten.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, rewritten)
2327
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Rewrite relative URI references in htmlSource against baseURI.

    Returns the source untouched when sgmllib is unavailable."""
    if not _SGML_AVAILABLE:
        return htmlSource

    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
2335
def _makeSafeAbsoluteURI(base, rel=None):
    """Resolve `rel` against `base` and vet the result's URI scheme.

    Returns u'' whenever the resulting URI carries a scheme outside
    ACCEPTABLE_URI_SCHEMES.  An empty ACCEPTABLE_URI_SCHEMES set disables
    all scheme checking.
    """
    # With no whitelist configured, every scheme is acceptable.
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        # No relative part supplied: vet the base URI on its own.
        try:
            scheme = urlparse.urlparse(base)[0]
        except ValueError:
            return u''
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    joined = _urljoin(base, rel)
    scheme = joined.strip().split(':', 1)[0]
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return joined
2354
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist-based HTML sanitizer.

    Re-emits only the elements, attributes, and CSS listed in the
    acceptance sets below; everything else is dropped.  Inline MathML and
    SVG islands are allowed when they declare the appropriate xmlns.
    """

    # HTML elements passed through unchanged.
    acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])

    # HTML attributes kept on acceptable elements.
    acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
      'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
      'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
      'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
      'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
      'width', 'wrap', 'xml:lang'])

    # Elements whose entire content (not just the tags) is discarded.
    unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])

    # CSS properties kept by sanitize_style().
    acceptable_css_properties = set(['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width'])

    # survey of common keywords found in feeds
    acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow'])

    # Matches simple CSS values: hex colors, rgb() triples, and short
    # numeric lengths/percentages.
    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    # MathML elements allowed inside a declared MathML island.
    mathml_elements = set([
        'annotation',
        'annotation-xml',
        'maction',
        'maligngroup',
        'malignmark',
        'math',
        'menclose',
        'merror',
        'mfenced',
        'mfrac',
        'mglyph',
        'mi',
        'mlabeledtr',
        'mlongdiv',
        'mmultiscripts',
        'mn',
        'mo',
        'mover',
        'mpadded',
        'mphantom',
        'mprescripts',
        'mroot',
        'mrow',
        'ms',
        'mscarries',
        'mscarry',
        'msgroup',
        'msline',
        'mspace',
        'msqrt',
        'msrow',
        'mstack',
        'mstyle',
        'msub',
        'msubsup',
        'msup',
        'mtable',
        'mtd',
        'mtext',
        'mtr',
        'munder',
        'munderover',
        'none',
        'semantics',
    ])

    # Attributes allowed on MathML elements.
    mathml_attributes = set([
        'accent',
        'accentunder',
        'actiontype',
        'align',
        'alignmentscope',
        'altimg',
        'altimg-height',
        'altimg-valign',
        'altimg-width',
        'alttext',
        'bevelled',
        'charalign',
        'close',
        'columnalign',
        'columnlines',
        'columnspacing',
        'columnspan',
        'columnwidth',
        'crossout',
        'decimalpoint',
        'denomalign',
        'depth',
        'dir',
        'display',
        'displaystyle',
        'edge',
        'encoding',
        'equalcolumns',
        'equalrows',
        'fence',
        'fontstyle',
        'fontweight',
        'form',
        'frame',
        'framespacing',
        'groupalign',
        'height',
        'href',
        'id',
        'indentalign',
        'indentalignfirst',
        'indentalignlast',
        'indentshift',
        'indentshiftfirst',
        'indentshiftlast',
        'indenttarget',
        'infixlinebreakstyle',
        'largeop',
        'length',
        'linebreak',
        'linebreakmultchar',
        'linebreakstyle',
        'lineleading',
        'linethickness',
        'location',
        'longdivstyle',
        'lquote',
        'lspace',
        'mathbackground',
        'mathcolor',
        'mathsize',
        'mathvariant',
        'maxsize',
        'minlabelspacing',
        'minsize',
        'movablelimits',
        'notation',
        'numalign',
        'open',
        'other',
        'overflow',
        'position',
        'rowalign',
        'rowlines',
        'rowspacing',
        'rowspan',
        'rquote',
        'rspace',
        'scriptlevel',
        'scriptminsize',
        'scriptsizemultiplier',
        'selection',
        'separator',
        'separators',
        'shift',
        'side',
        'src',
        'stackalign',
        'stretchy',
        'subscriptshift',
        'superscriptshift',
        'symmetric',
        'voffset',
        'width',
        'xlink:href',
        'xlink:show',
        'xlink:type',
        'xmlns',
        'xmlns:xlink',
    ])

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use'])

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
       'arabic-form', 'ascent', 'attributeName', 'attributeType',
       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
       'overline-position', 'overline-thickness', 'panose-1', 'path',
       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
       'stop-color', 'stop-opacity', 'strikethrough-position',
       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
       'y2', 'zoomAndPan'])

    # Lazily-built lowercase->camelCase maps for SVG (see unknown_starttag).
    svg_attr_map = None
    svg_elem_map = None

    # CSS properties allowed inside SVG islands.
    acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity'])

    def reset(self):
        # Reset inherited parser state, then the sanitizer's own counters.
        _BaseHTMLProcessor.reset(self)
        # depth of open elements whose entire content is being discarded
        self.unacceptablestack = 0
        # nesting depth inside declared MathML / SVG islands
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        # Decide whether to keep this start tag, and with which attributes.
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag=='svg':
                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                    if tag=='math':
                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if  self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower=[attr.lower() for attr in self.svg_attributes]
                    mix=[a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(),a) for a in mix])

                    lower=[attr.lower() for attr in self.svg_elements]
                    mix=[a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(),a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag,tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        # NOTE: the tuple-parameter lambda below is Python 2-only syntax.
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key=keymap.get(key,key)
                # make sure the uri uses an acceptable uri scheme
                if key == u'href':
                    value = _makeSafeAbsoluteURI(value)
                clean_attrs.append((key,value))
            elif key=='style':
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key,clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        # Emit the end tag only if the element was acceptable; track exits
        # from discarded content and from MathML/SVG islands.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag,tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # Processing instructions are dropped entirely.
        pass

    def handle_decl(self, text):
        # Declarations (e.g. DOCTYPE) are dropped entirely.
        pass

    def handle_data(self, text):
        # Suppress character data inside discarded elements.
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        # Filter a style attribute down to whitelisted CSS.
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
            return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                # shorthand properties: every keyword must be whitelisted
                # or match the simple-value pattern
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

    def parse_comment(self, i, report=1):
        # Parse an HTML comment at offset i, defending against malformed
        # or truncated comments.
        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
2745
2746
def _sanitizeHTML(htmlSource, encoding, _type):
    """Strip unacceptable markup from htmlSource and return the cleaned,
    whitespace-trimmed result.

    Returns the source untouched when sgmllib is unavailable."""
    if not _SGML_AVAILABLE:
        return htmlSource
    sanitizer = _HTMLSanitizer(encoding, _type)
    # Neutralize CDATA openers so their contents are escaped, not parsed.
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    sanitizer.feed(htmlSource)
    cleaned = sanitizer.output()
    return cleaned.strip().replace('\r\n', '\n')
2756
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that records HTTP status codes on the response
    instead of raising, follows redirects while noting the final URL, and
    upgrades failed basic auth to digest auth when the server asks."""

    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        # Follow the redirect, but record the status code and the URL we
        # were redirected to so callers can see them.
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
                          or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        # Split on the first colon only: RFC 7617 forbids colons in the
        # user-id but allows them in the password, so a plain split(':')
        # would raise ValueError for such passwords.
        user, passw = auth.split(':', 1)
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry
2799
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.

    :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`.
    """

    # already a file-like object: hand it back untouched
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if isinstance(url_file_stream_or_string, basestring) \
       and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # Test for inline user:password credentials for HTTP basic auth
        auth = None
        if base64 and not url_file_stream_or_string.startswith('ftp:'):
            # strip user:password from the URL and carry it as a
            # base64-encoded Authorization value instead
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        if isinstance(url_file_stream_or_string, unicode):
            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD
            # NOTE(review): close() releases the opener's handlers; the
            # returned response object appears to stay readable -- confirm
            # against the urllib2 implementation in use.

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except (IOError, UnicodeEncodeError, TypeError):
        # if url_file_stream_or_string is a unicode object that
        # cannot be converted to the encoding returned by
        # sys.getfilesystemencoding(), a UnicodeEncodeError
        # will be thrown
        # If url_file_stream_or_string is a string that contains NULL
        # (such as an XML document encoded in UTF-32), TypeError will
        # be thrown.
        pass

    # treat url_file_stream_or_string as string
    if isinstance(url_file_stream_or_string, unicode):
        return _StringIO(url_file_stream_or_string.encode('utf-8'))
    return _StringIO(url_file_stream_or_string)
2887
def _convert_to_idn(url):
    """Convert a URL to IDN notation"""
    # this function should only be called with a unicode string
    # strategy: if the host cannot be encoded in ascii, then
    # it'll be necessary to encode it in idn form
    parts = list(urlparse.urlsplit(url))
    try:
        parts[1].encode('ascii')
    except UnicodeEncodeError:
        # split off an explicit port, if any, before encoding the host
        pieces = parts[1].rsplit(':', 1)
        port = u''
        if len(pieces) == 2:
            port = pieces.pop()
        # encode each dotted label of the hostname separately
        labels = [label.encode('idna').decode('utf-8')
                  for label in pieces[0].split('.')]
        parts[1] = '.'.join(labels)
        if port:
            parts[1] = parts[1] + ':' + port
        return urlparse.urlunsplit(parts)
    # host was pure ASCII: nothing to do
    return url
2911
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    """Assemble a urllib2.Request carrying conditional-GET, auth,
    referrer, compression, and caller-supplied headers."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    # normalize `modified` to a 9-tuple time struct
    if isinstance(modified, basestring):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        stamp = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
            short_weekdays[modified[6]], modified[2],
            months[modified[1] - 1], modified[0],
            modified[3], modified[4], modified[5])
        request.add_header('If-Modified-Since', stamp)
    if referrer:
        request.add_header('Referer', referrer)
    # advertise only the decompression schemes we can actually handle
    if gzip and zlib:
        request.add_header('Accept-encoding', 'gzip, deflate')
    elif gzip:
        request.add_header('Accept-encoding', 'gzip')
    elif zlib:
        request.add_header('Accept-encoding', 'deflate')
    else:
        request.add_header('Accept-encoding', '')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
2949
2950def _parse_psc_chapter_start(start):
2951    FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
2952
2953    m = re.compile(FORMAT).match(start)
2954    if m is None:
2955        return None
2956
2957    _, h, m, s, _, ms = m.groups()
2958    h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
2959    return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
2960
2961_date_handlers = []
2962def registerDateHandler(func):
2963    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2964    _date_handlers.insert(0, func)
2965
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# Expand each template's placeholders (YYYY, MM, DD, OOO = ordinal day,
# CC = century) into named regex groups, and append an optional
# time-of-day/timezone suffix to every one of them.
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# In Python 2 list-comprehension loop variables leak into the enclosing
# scope; delete them so they don't pollute the module namespace. Python 3
# doesn't leak them, hence the NameError guard.
try:
    del tmpl
except NameError:
    pass
# Pre-compile every expanded pattern; only the bound `match` methods are
# kept since that's all _parse_date_iso8601 needs.
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
3003
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    # Try each pre-compiled template pattern in order; the template list
    # is ordered so the greediest (most specific) pattern wins.
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m:
            break
    if not m:
        return
    if m.span() == (0, 0):
        # The empty template ('') matches anything with zero width;
        # treat a zero-width match as "no match".
        return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        # No year given: default to the current (GMT) year.
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params:
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    # `ordinal` goes in the tm_yday slot (index 7) of the 9-tuple.
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments: shift hours/minutes so the result
    # is expressed in UTC before normalization.
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
3082
# 8-bit date handling routines written by ytrewq1.
# Korean date-word constants used to recognize OnBlog and Nate feed dates.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# Matches e.g. '2004년 05월 28일 01:31:46' (year/month/day markers in Korean).
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# Matches e.g. '2004-05-25 오전 11:23:17' (12-hour clock with Korean AM/PM).
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    match = _korean_onblog_date_re.match(dateString)
    if not match:
        return
    year, month, day, hour, minute, second = match.groups()
    fields = {'year': year, 'month': month, 'day': day,
              'hour': hour, 'minute': minute, 'second': second,
              'zonediff': '+09:00'}
    # Rebuild as a W3DTF timestamp (KST is fixed at UTC+9) and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
3107
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    match = _korean_nate_date_re.match(dateString)
    if not match:
        return
    # Nate uses a 12-hour clock with Korean AM/PM markers.
    hour = int(match.group(5))
    if match.group(4) == _korean_pm:
        hour += 12
    fields = {'year': match.group(1), 'month': match.group(2),
              'day': match.group(3), 'hour': '%02d' % hour,
              'minute': match.group(6), 'second': match.group(7),
              'zonediff': '+09:00'}
    # Rebuild as a W3DTF timestamp (KST is fixed at UTC+9) and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
3126
# Unicode strings for Greek date strings
# Maps Greek month abbreviations (several spelling variants) to English
# RFC 822 month abbreviations.
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

# Maps Greek weekday abbreviations to English RFC 822 weekday names.
_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

# RFC-822-like layout: 'weekday, DD month YYYY HH:MM:SS zone'.
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
3164
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    match = _greek_date_format_re.match(dateString)
    if not match:
        return
    # A KeyError on an unknown weekday/month name is intentionally left
    # unhandled here; _parse_date() catches it and tries the next handler.
    fields = {
        'wday': _greek_wdays[match.group(1)],
        'day': match.group(2),
        'month': _greek_months[match.group(3)],
        'year': match.group(4),
        'hour': match.group(5),
        'minute': match.group(6),
        'second': match.group(7),
        'zonediff': match.group(8),
    }
    # Translate to an English RFC 822 date and delegate.
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % fields
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
3178
# Unicode strings for Hungarian date strings
# Maps Hungarian month names to two-digit month numbers.
# NOTE(review): u'febru\u00e1ri' and u'm\u00e1ujus' look like misspellings
# of the Hungarian month names (febru\u00e1r / m\u00e1jus) -- kept as-is
# because incoming feeds are matched against these exact keys.
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

# Layout: 'YYYY-monthname-DDTHH:MM+ZZ:ZZ' (day/hour may be one digit).
_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3198
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    match = _hungarian_date_format_re.match(dateString)
    if match is None or match.group(2) not in _hungarian_months:
        return None
    # Zero-pad single-digit day and hour values.
    day = match.group(3)
    day = '0' + day if len(day) == 1 else day
    hour = match.group(4)
    hour = '0' + hour if len(hour) == 1 else hour
    fields = {
        'year': match.group(1),
        'month': _hungarian_months[match.group(2)],
        'day': day,
        'hour': hour,
        'minute': match.group(5),
        'zonediff': match.group(6),
    }
    # Rebuild as a W3DTF timestamp and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
3217
3218timezonenames = {
3219    'ut': 0, 'gmt': 0, 'z': 0,
3220    'adt': -3, 'ast': -4, 'at': -4,
3221    'edt': -4, 'est': -5, 'et': -5,
3222    'cdt': -5, 'cst': -6, 'ct': -6,
3223    'mdt': -6, 'mst': -7, 'mt': -7,
3224    'pdt': -7, 'pst': -8, 'pt': -8,
3225    'a': -1, 'n': 1,
3226    'm': -12, 'y': 12,
3227}
3228# W3 date and time format parser
3229# http://www.w3.org/TR/NOTE-datetime
3230# Also supports MSSQL-style datetimes as defined at:
3231# http://msdn.microsoft.com/en-us/library/ms186724.aspx
3232# (basically, allow a space as a date/time/timezone separator)
3233def _parse_date_w3dtf(datestr):
3234    if not datestr.strip():
3235        return None
3236    parts = datestr.lower().split('t')
3237    if len(parts) == 1:
3238        # This may be a date only, or may be an MSSQL-style date
3239        parts = parts[0].split()
3240        if len(parts) == 1:
3241            # Treat this as a date only
3242            parts.append('00:00:00z')
3243    elif len(parts) > 2:
3244        return None
3245    date = parts[0].split('-', 2)
3246    if not date or len(date[0]) != 4:
3247        return None
3248    # Ensure that `date` has 3 elements. Using '1' sets the default
3249    # month to January and the default day to the 1st of the month.
3250    date.extend(['1'] * (3 - len(date)))
3251    try:
3252        year, month, day = [int(i) for i in date]
3253    except ValueError:
3254        # `date` may have more than 3 elements or may contain
3255        # non-integer strings.
3256        return None
3257    if parts[1].endswith('z'):
3258        parts[1] = parts[1][:-1]
3259        parts.append('z')
3260    # Append the numeric timezone offset, if any, to parts.
3261    # If this is an MSSQL-style date then parts[2] already contains
3262    # the timezone information, so `append()` will not affect it.
3263    # Add 1 to each value so that if `find()` returns -1 it will be
3264    # treated as False.
3265    loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
3266    loc = loc - 1
3267    parts.append(parts[1][loc:])
3268    parts[1] = parts[1][:loc]
3269    time = parts[1].split(':', 2)
3270    # Ensure that time has 3 elements. Using '0' means that the
3271    # minutes and seconds, if missing, will default to 0.
3272    time.extend(['0'] * (3 - len(time)))
3273    tzhour = 0
3274    tzmin = 0
3275    if parts[2][:1] in ('-', '+'):
3276        try:
3277            tzhour = int(parts[2][1:3])
3278            tzmin = int(parts[2][4:])
3279        except ValueError:
3280            return None
3281        if parts[2].startswith('-'):
3282            tzhour = tzhour * -1
3283            tzmin = tzmin * -1
3284    else:
3285        tzhour = timezonenames.get(parts[2], 0)
3286    try:
3287        hour, minute, second = [int(float(i)) for i in time]
3288    except ValueError:
3289        return None
3290    # Create the datetime object and timezone delta objects
3291    try:
3292        stamp = datetime.datetime(year, month, day, hour, minute, second)
3293    except ValueError:
3294        return None
3295    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3296    # Return the date and timestamp in a UTC 9-tuple
3297    try:
3298        return (stamp - delta).utctimetuple()
3299    except (OverflowError, ValueError):
3300        # IronPython throws ValueErrors instead of OverflowErrors
3301        return None
3302
# Add the W3DTF parser to the chain of handlers tried by _parse_date().
registerDateHandler(_parse_date_w3dtf)
3304
def _parse_date_rfc822(date):
    """Parse RFC 822 dates and times
    http://tools.ietf.org/html/rfc822#section-5

    There are some formatting differences that are accounted for:
    1. Years may be two or four digits.
    2. The month and day can be swapped.
    3. Additional timezone names are supported.
    4. A default time and timezone are assumed if only a date is present.

    Returns a 9-tuple in UTC (datetime.utctimetuple()) or None on failure.
    """
    daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
    months = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }

    parts = date.lower().split()
    if len(parts) < 5:
        # Assume that the time and timezone are missing
        parts.extend(('00:00:00', '0000'))
    # Remove the day name
    if parts[0][:3] in daynames:
        parts = parts[1:]
    if len(parts) < 5:
        # If there are still fewer than five parts, there's not enough
        # information to interpret this
        return None
    try:
        day = int(parts[0])
    except ValueError:
        # Check if the day and month are swapped
        if months.get(parts[0][:3]):
            try:
                day = int(parts[1])
            except ValueError:
                return None
            else:
                # Move the month name into position 1 where it's expected.
                parts[1] = parts[0]
        else:
            return None
    month = months.get(parts[1][:3])
    if not month:
        return None
    try:
        year = int(parts[2])
    except ValueError:
        return None
    # Normalize two-digit years:
    # Anything in the 90's is interpreted as 1990 and on
    # Anything 89 or less is interpreted as 2089 or before
    if len(parts[2]) <= 2:
        year += (1900, 2000)[year < 90]
    # Missing minutes/seconds default to 0.
    timeparts = parts[3].split(':')
    timeparts = timeparts + ([0] * (3 - len(timeparts)))
    try:
        (hour, minute, second) = map(int, timeparts)
    except ValueError:
        return None
    tzhour = 0
    tzmin = 0
    # Strip 'Etc/' from the timezone
    if parts[4].startswith('etc/'):
        parts[4] = parts[4][4:]
    # Normalize timezones that start with 'gmt':
    # GMT-05:00 => -0500
    # GMT => GMT
    if parts[4].startswith('gmt'):
        parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
    # Handle timezones like '-0500', '+0500', and 'EST'
    if parts[4] and parts[4][0] in ('-', '+'):
        try:
            tzhour = int(parts[4][1:3])
            tzmin = int(parts[4][3:])
        except ValueError:
            return None
        if parts[4].startswith('-'):
            tzhour = tzhour * -1
            tzmin = tzmin * -1
    else:
        # Named timezones; unknown names are treated as UTC.
        tzhour = timezonenames.get(parts[4], 0)
    # Create the datetime object and timezone delta objects
    try:
        stamp = datetime.datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None
    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
    # Return the date and timestamp in a UTC 9-tuple
    try:
        return (stamp - delta).utctimetuple()
    except (OverflowError, ValueError):
        # IronPython throws ValueErrors instead of OverflowErrors
        return None
registerDateHandler(_parse_date_rfc822)
3398
# Lowercase English month abbreviations, index = month number - 1.
_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
def _parse_date_asctime(dt):
    """Parse asctime-style dates.

    Converts asctime to RFC822-compatible dates and uses the RFC822 parser
    to do the actual parsing.

    Supported formats (format is standardized to the first one listed):

    * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
    * {weekday name} {month name} dd hh:mm:ss yyyy
    """

    parts = dt.split()

    # No timezone present: insert GMT so both formats unify to six parts.
    if len(parts) == 5:
        parts.insert(4, '+0000')

    # Anything other than six parts is not asctime.
    if len(parts) != 6:
        return None

    # Shuffle into RFC 822 order (wday, dd, month, yyyy, time, tz)
    # and delegate to the RFC 822 parser.
    wday, month, day, timeofday, tz, year = parts
    return _parse_date_rfc822(' '.join([wday, day, month, year, timeofday, tz]))
registerDateHandler(_parse_date_asctime)
3428
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Example: Fri, 2006/09/15 08:19:53 EDT
    pattern = re.compile( \
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    match = pattern.search(aDateString)
    if match is None:
        return None
    dow, year, month, day, hour, minute, second, tz = match.groups()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # Reassemble as an RFC 822 date and lean on the stdlib parser.
    dateString = "%s, %s %s %s %s:%s:%s %s" % (
        dow, day, month_names[int(month) - 1], year, hour, minute, second, tz)
    parsed = rfc822.parsedate_tz(dateString)
    if parsed:
        return time.gmtime(rfc822.mktime_tz(parsed))
registerDateHandler(_parse_date_perforce)
3445
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    if not dateString:
        return None
    for handler in _date_handlers:
        # A handler that chokes on this format simply yields to the next.
        try:
            result = handler(dateString)
        except (KeyError, OverflowError, ValueError):
            continue
        # Accept only a well-formed 9-tuple.
        if result and len(result) == 9:
            return result
    return None
3461
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])

# Two NUL bytes; used to distinguish a UTF-16 BOM from a UTF-32 BOM.
ZERO_BYTES = _l2bytes([0x00, 0x00])

# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
# (Operates on a unicode string, after the document has been decoded.)
RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')

# Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?>
# (Operates on the raw byte string, before decoding.)
RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3479
def convert_to_utf8(http_headers, data):
    '''Detect and convert the character encoding to UTF-8.

    http_headers is a dictionary
    data is a raw string (not Unicode)

    Returns a 3-tuple (data, rfc3023_encoding, error): the document
    re-encoded as UTF-8 bytes (left as-is if no candidate encoding
    worked), the encoding actually used (empty string when unknown),
    and None or an exception instance describing any encoding problem
    that was detected along the way.'''

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified.  But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii.  (We now do this.)  And also that it
    # must always be flagged as non-well-formed.  (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible.  Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not).  iconv_codec can help a lot;
    # you should definitely install it if you can.
    # http://cjkpython.i18n.org/

    bom_encoding = u''
    xml_encoding = u''
    rfc3023_encoding = u''

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = u'utf-32be'
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = u'utf-32le'
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16be'
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16le'
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = u'utf-8'
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = u'cp037'
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = u'utf-16be'
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = u'utf-16le'
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = u'utf-32be'
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = u'utf-32le'

    # Decode just enough of the document (via the BOM-sniffed encoding,
    # if any) that the ASCII-compatible regex below can find the XML
    # declaration's encoding attribute.
    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # feedparser recognizes UTF-32 encodings that aren't
        # available in Python 2.4 and 2.5, so it's possible to
        # encounter a LookupError during decoding.
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # Normalize the xml_encoding if necessary.
        # A BOM already fixed the byte order, so generic UTF-16/32 names
        # in the declaration are replaced with the BOM's specific variant.
        if bom_encoding and (xml_encoding in (
            u'u16', u'utf-16', u'utf16', u'utf_16',
            u'u32', u'utf-32', u'utf32', u'utf_32',
            u'iso-10646-ucs-2', u'iso-10646-ucs-4',
            u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
        )):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get('content-type') or ''
    http_content_type, params = cgi.parse_header(http_content_type)
    http_encoding = params.get('charset', '').replace("'", "")
    if not isinstance(http_encoding, unicode):
        http_encoding = http_encoding.decode('utf-8', 'ignore')

    acceptable_content_type = 0
    application_content_types = (u'application/xml', u'application/xml-dtd',
                                 u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith(u'application/') and
        http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith(u'text/') and
          http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
        rfc3023_encoding = xml_encoding or u'iso-8859-1'
    else:
        rfc3023_encoding = xml_encoding or u'utf-8'
    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == u'gb2312':
        rfc3023_encoding = u'gb18030'
    if xml_encoding.lower() == u'gb2312':
        xml_encoding = u'gb18030'

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    error = None

    if http_headers and (not acceptable_content_type):
        if 'content-type' in http_headers:
            msg = '%s is not an XML media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = 0
    lazy_chardet_encoding = None
    tried_encodings = []
    if chardet:
        # Deferred so chardet (which scans the whole document) only runs
        # if every earlier candidate encoding fails.
        def lazy_chardet_encoding():
            chardet_encoding = chardet.detect(data)['encoding']
            if not chardet_encoding:
                chardet_encoding = ''
            if not isinstance(chardet_encoding, unicode):
                chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
            return chardet_encoding
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
                              lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
        if callable(proposed_encoding):
            proposed_encoding = proposed_encoding()
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = data.decode(proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = 1
            # Update the encoding in the opening XML processing instruction.
            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
            if RE_XML_DECLARATION.search(data):
                data = RE_XML_DECLARATION.sub(new_declaration, data)
            else:
                data = new_declaration + u'\n' + data
            data = data.encode('utf-8')
            break
    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            'document encoding unknown, I tried ' +
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
            (rfc3023_encoding, xml_encoding))
        rfc3023_encoding = u''
    elif proposed_encoding != rfc3023_encoding:
        # Decoding succeeded, but with an encoding other than the one the
        # headers/declaration promised; report the override to the caller.
        error = CharacterEncodingOverride(
            'document declared as %s, but parsed as %s' %
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

    return data, rfc3023_encoding, error
3690
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
# (Disallowing '&' inside the replacement text, except for numeric
# character references, is what blocks entity-expansion attacks.)
RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3706
def replace_doctype(data):
    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)

    data is a raw byte string containing the XML document.

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document with a replaced DOCTYPE

    Also returns a dict of the "safe" entity declarations (name -> value,
    both as unicode) that were preserved, for use by the loose parser.
    '''

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    start = re.search(_s2bytes(r'<\w'), data)
    start = start and start.start() or -1
    head, data = data[:start+1], data[start+1:]

    # Save and then remove all of the ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if _s2bytes('netscape') in doctype.lower():
        version = u'rss091n'
    else:
        version = None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = _s2bytes('')
    if len(doctype_results) == 1 and entity_results:
        # List comprehension instead of filter()+lambda: same result on
        # Python 2, and the truthiness test below stays correct on
        # Python 3 (where a filter object is always truthy).
        safe_entities = [e for e in entity_results
                         if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
                        + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
                        + _s2bytes('>\n]>')
    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

    # Precompute the safe entities for the loose parser.
    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
                      for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
    return version, data, safe_entities
3747
3748
3749# GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
3750# items, or None in the case of a parsing error.
3751
def _parse_poslist(value, geom_type, swap=True, dims=2):
    # Dispatch a GML pos-list string to the matching geometry parser.
    # Only 'linestring' and 'polygon' are understood; anything else -> None.
    if geom_type == 'polygon':
        # A polygon's exterior ring is parsed as a line, then wrapped.
        ring = _parse_georss_line(value, swap, dims)
        return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
    if geom_type == 'linestring':
        return _parse_georss_line(value, swap, dims)
    return None
3760
def _gen_georss_coords(value, swap=True, dims=2):
    # A generator of (lon, lat) pairs from a string of encoded GeoRSS
    # coordinates. Converts to floats and swaps order.
    #
    # value: whitespace- and/or comma-separated string of numbers
    #     (GeoRSS order is "lat lon"; swap=True yields (lon, lat)).
    # dims: if 3, a third value (presumably elevation -- TODO confirm)
    #     is appended to each tuple.
    #
    # NOTE(review): Python 2 only (itertools.imap, .next). Termination
    # relies on the StopIteration raised by nxt() leaking out and ending
    # the generator -- behavior that PEP 479 turns into a RuntimeError
    # on Python 3.7+.
    latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
    nxt = latlons.next
    while True:
        # [::-1] reverses the (lat, lon) pair when swap is true.
        t = [nxt(), nxt()][::swap and -1 or 1]
        if dims == 3:
            t.append(nxt())
        yield tuple(t)
3771
def _parse_georss_point(value, swap=True, dims=2):
    # A point is a single latitude-longitude pair separated by whitespace
    # (comma separators are tolerated). Returns a GeoJSON-style dict,
    # or None if the value cannot be parsed.
    try:
        points = list(_gen_georss_coords(value, swap, dims))
        first = points[0]
    except (IndexError, ValueError):
        return None
    return {u'type': u'Point', u'coordinates': first}
3780
def _parse_georss_line(value, swap=True, dims=2):
    # A line is a whitespace-separated list of latitude-longitude pairs
    # in the WGS84 coordinate reference system. Returns a GeoJSON-style
    # dict, or None if the value cannot be parsed.
    try:
        points = list(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    return {u'type': u'LineString', u'coordinates': points}
3790
def _parse_georss_polygon(value, swap=True, dims=2):
    # A polygon is a whitespace-separated list of latitude-longitude
    # pairs. A closed ring needs at least four pairs, the last identical
    # to the first (so at least three distinct points). Returns a
    # GeoJSON-style dict, or None on a parse error or short ring.
    try:
        exterior = list(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    if len(exterior) >= 4:
        return {u'type': u'Polygon', u'coordinates': (exterior,)}
    return None
3803
def _parse_georss_box(value, swap=True, dims=2):
    # A bounding box is a rectangular region, often used to define the
    # extents of a map or a rough area of interest. It holds two
    # whitespace-separated latitude-longitude pairs: lower corner first,
    # then upper corner. Returns a GeoJSON-style dict, or None on error.
    try:
        corners = tuple(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    return {u'type': u'Box', u'coordinates': corners}
3814
3815# end geospatial parsers
3816
3817
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    etag/modified are forwarded to the HTTP request for conditional GETs;
    agent and referrer set the User-Agent and Referer headers; handlers
    is a urllib2-style handler (or list of handlers); response_headers,
    if given, overrides headers reported by the server.

    :return: A :class:`FeedParserDict`.
    '''

    # Fresh per-call defaults (avoids the mutable-default-argument trap).
    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        # Retrieval failures are reported via bozo/bozo_exception rather
        # than raised; the function still returns a (mostly empty) result.
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # lowercase all of the HTTP headers for comparisons per RFC 2616
    if 'headers' in result:
        http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
    else:
        http_headers = {}

    # if feed is gzip-compressed, decompress it
    if f and data and http_headers:
        if gzip and 'gzip' in http_headers.get('content-encoding', ''):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad.
                # struct.error can occur if the data is damaged.
                result['bozo'] = 1
                result['bozo_exception'] = e
                if isinstance(e, struct.error):
                    # A gzip header was found but the data is corrupt.
                    # Ideally, we should re-request the feed without the
                    # 'Accept-encoding: gzip' header, but we don't.
                    data = None
        elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                try:
                    # The data may have no headers and no checksum.
                    data = zlib.decompress(data, -15)
                except zlib.error, e:
                    result['bozo'] = 1
                    result['bozo_exception'] = e

    # save HTTP headers
    if http_headers:
        if 'etag' in http_headers:
            etag = http_headers.get('etag', u'')
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in http_headers:
            modified = http_headers.get('last-modified', u'')
            if modified:
                result['modified'] = modified
                result['modified_parsed'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        # Default status; overwritten below if f reports a real one.
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # Stop processing if the server sent HTTP 304 Not Modified.
    if getattr(f, 'code', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # Re-encode the document as UTF-8; an empty detected encoding means
    # we cannot trust the bytes enough for the strict (SAX) parser.
    data, result['encoding'], error = convert_to_utf8(http_headers, data)
    use_strict_parser = result['encoding'] and True or False
    if error is not None:
        result['bozo'] = 1
        result['bozo_exception'] = error

    result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = http_headers.get('content-location', u'')
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', None)
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException, e:
            # Record the error and fall through to the loose parser.
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
3970
# The list of EPSG codes for geographic (latitude/longitude) coordinate
# systems to support decoding of GeoRSS GML profiles.
# Membership in this list means a gml:pos/posList uses lat-lon axis order
# and should be swapped to (lon, lat) -- presumably checked by the GML
# handlers elsewhere in this file; verify against callers.
_geogCS = [
3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
Note: See TracBrowser for help on using the repository browser.