source: OpenRLabs-Git/deploy/rlabs-docker/web2py-rlabs/gluon/contrib/feedparser.py

main
Last change on this file was 42bd667, checked in by David Fuertes <dfuertes@…>, 4 years ago

Historial Limpio

  • Property mode set to 100755
File size: 156.3 KB
Line 
1"""Universal feed parser
2
3Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
4
5Visit https://code.google.com/p/feedparser/ for the latest version
6Visit http://packages.python.org/feedparser/ for the latest documentation
7
8Required: Python 2.4 or later
9Recommended: iconv_codec <http://cjkpython.i18n.org/>
10"""
11
12__version__ = "5.2.1"
13__license__ = """
14Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
15Copyright 2002-2008 Mark Pilgrim
16All rights reserved.
17
18Redistribution and use in source and binary forms, with or without modification,
19are permitted provided that the following conditions are met:
20
21* Redistributions of source code must retain the above copyright notice,
22  this list of conditions and the following disclaimer.
23* Redistributions in binary form must reproduce the above copyright notice,
24  this list of conditions and the following disclaimer in the documentation
25  and/or other materials provided with the distribution.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE."""
38__author__ = "Mark Pilgrim <http://diveintomark.org/>"
39__contributors__ = ["Jason Diamond <http://injektilo.org/>",
40                    "John Beimler <http://john.beimler.org/>",
41                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
42                    "Aaron Swartz <http://aaronsw.com/>",
43                    "Kevin Marks <http://epeus.blogspot.com/>",
44                    "Sam Ruby <http://intertwingly.net/>",
45                    "Ade Oshineye <http://blog.oshineye.com/>",
46                    "Martin Pool <http://sourcefrog.net/>",
47                    "Kurt McKee <http://kurtmckee.org/>",
48                    "Bernd Schlapsi <https://github.com/brot>",]
49
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
# Feed MIME types are preferred (q=1 implied); generic XML is acceptable,
# and anything else is a last resort.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.  (Truthy/falsy flag; kept as an int for historical compatibility.)
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.  (Truthy/falsy flag, same convention as above.)
SANITIZE_HTML = 1
71
72# ---------- Python 3 modules (make it work if possible) ----------
73try:
74    import rfc822
75except ImportError:
76    from email import _parseaddr as rfc822
77
78try:
79    # Python 3.1 introduces bytes.maketrans and simultaneously
80    # deprecates string.maketrans; use bytes.maketrans if possible
81    _maketrans = bytes.maketrans
82except (NameError, AttributeError):
83    import string
84    _maketrans = string.maketrans
85
86# base64 support for Atom feeds that contain embedded binary data
87try:
88    import base64, binascii
89except ImportError:
90    base64 = binascii = None
91else:
92    # Python 3.1 deprecates decodestring in favor of decodebytes
93    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
94
95# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
96# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
97try:
98    if bytes is str:
99        # In Python 2.5 and below, bytes doesn't exist (NameError)
100        # In Python 2.6 and above, bytes and str are the same type
101        raise NameError
102except NameError:
103    # Python 2
104    def _s2bytes(s):
105        return s
106    def _l2bytes(l):
107        return ''.join(map(chr, l))
108else:
109    # Python 3
110    def _s2bytes(s):
111        return bytes(s, 'utf8')
112    def _l2bytes(l):
113        return bytes(l)
114
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
# (Fix: 'mms' and 'svn' previously appeared twice; duplicates removed.
# Membership tests are unaffected.)
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
131
132# ---------- required modules (should come with any Python distribution) ----------
133import cgi
134import codecs
135import copy
136import datetime
137import itertools
138import re
139import struct
140import time
141import types
142import urllib
143import urllib2
144import urlparse
145import warnings
146
147from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
148
149try:
150    from io import BytesIO as _StringIO
151except ImportError:
152    try:
153        from cStringIO import StringIO as _StringIO
154    except ImportError:
155        from StringIO import StringIO as _StringIO
156
157# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
158
159# gzip is included with most Python distributions, but may not be available if you compiled your own
160try:
161    import gzip
162except ImportError:
163    gzip = None
164try:
165    import zlib
166except ImportError:
167    zlib = None
168
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    from xml.sax.saxutils import escape as _xmlescape
except ImportError:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        # Minimal stand-in for xml.sax.saxutils.escape(): escape '&', '<'
        # and '>' plus any caller-supplied {character: entity} replacements.
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        # Fix: *entities* is a dict (same contract as saxutils.escape), so
        # iterate over its items; iterating the dict directly attempted to
        # unpack single-character keys and raised ValueError.
        for char, entity in entities.items():
            data = data.replace(char, entity)
        return data
else:
    try:
        xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    except xml.sax.SAXReaderNotAvailable:
        _XML_AVAILABLE = 0
    else:
        _XML_AVAILABLE = 1
192
# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing and content sanitizing
try:
    import sgmllib
except ImportError:
    # This is probably Python 3, which doesn't include sgmllib anymore
    _SGML_AVAILABLE = 0

    # Mock sgmllib enough to allow subclassing later on
    class sgmllib(object):
        class SGMLParser(object):
            def goahead(self, i):
                # Stub: the real sgmllib drives the parse loop from here.
                pass
            def parse_starttag(self, i):
                # Stub: the real sgmllib parses start tags here.
                pass
else:
    _SGML_AVAILABLE = 1

    # sgmllib defines a number of module-level regular expressions that are
    # insufficient for the XML parsing feedparser needs. Rather than modify
    # the variables directly in sgmllib, they're defined here using the same
    # names, and the compiled code objects of several sgmllib.SGMLParser
    # methods are copied into _BaseHTMLProcessor so that they execute in
    # feedparser's scope instead of sgmllib's scope.
    # Decimal ('&#160;') or hex ('&#xA0;') character references:
    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
    # Tag names, extended to allow ':' and '_' for namespaced XML elements:
    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    # Attribute name/value pairs, quoted or bare:
    attrfind = re.compile(
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
    )

    # Unfortunately, these must be copied over to prevent NameError exceptions
    entityref = sgmllib.entityref
    incomplete = sgmllib.incomplete
    interesting = sgmllib.interesting
    shorttag = sgmllib.shorttag
    shorttagopen = sgmllib.shorttagopen
    starttagopen = sgmllib.starttagopen

    class _EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, target, index=0):
            # Find the next unquoted angle bracket in *target* starting at
            # *index*; returns None when no bracket is found.
            match = self.endbracket.match(target, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        # Minimal match-object wrapper: sgmllib only calls .start() on the
        # result of endbracket.search(), and here it must see the *end* of
        # the wrapped match (the bracket position).
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    endbracket = _EndBracketRegEx()
250
251
252# iconv_codec provides support for more character encodings.
253# It's available from http://cjkpython.i18n.org/
254try:
255    import iconv_codec
256except ImportError:
257    pass
258
259# chardet library auto-detects character encodings
260# Download from http://chardet.feedparser.org/
261try:
262    import chardet
263except ImportError:
264    chardet = None
265
# ---------- don't touch these ----------
# Internal exception hierarchy.  The subclasses of
# ThingsNobodyCaresAboutButMe let callers catch all encoding-related
# oddities with a single except clause; the names describe the condition
# (raised by code outside this chunk).
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
# Deliberately outside the hierarchy above: an undeclared namespace prefix
# is a hard error, not an encoding quirk.
class UndeclaredNamespace(Exception): pass
272
# Human-readable names for every feed format/version feedparser can detect,
# keyed by the internal version identifier (see _FeedParserMixin.version).
# The empty-string key means the feed type could not be determined.
SUPPORTED_VERSIONS = {'': u'unknown',
                      'rss090': u'RSS 0.90',
                      'rss091n': u'RSS 0.91 (Netscape)',
                      'rss091u': u'RSS 0.91 (Userland)',
                      'rss092': u'RSS 0.92',
                      'rss093': u'RSS 0.93',
                      'rss094': u'RSS 0.94',
                      'rss20': u'RSS 2.0',
                      'rss10': u'RSS 1.0',
                      'rss': u'RSS (unknown version)',
                      'atom01': u'Atom 0.1',
                      'atom02': u'Atom 0.2',
                      'atom03': u'Atom 0.3',
                      'atom10': u'Atom 1.0',
                      'atom': u'Atom (unknown version)',
                      'cdf': u'CDF',
                      }
290
class FeedParserDict(dict):
    """Dict subclass exposing feed data under both modern and legacy names.

    Keys can also be read as attributes (``d.feed`` == ``d['feed']``).
    ``keymap`` maps legacy key names to their modern equivalents; a list
    value means "first of these keys that exists".  A few virtual keys
    ('category', 'enclosures', 'license') are computed from stored data.

    Note: the py2-only ``raise Exc, msg`` statements were rewritten in the
    equivalent ``raise Exc(msg)`` call form, which behaves identically and
    is also valid Python 3 — consistent with this module's py2/py3
    compatibility shims.
    """
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'description_detail': ['summary_detail', 'subtitle_detail'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        if key == 'category':
            # Virtual key: the first tag's term.
            try:
                return dict.__getitem__(self, 'tags')[0]['term']
            except IndexError:
                raise KeyError("object doesn't have key 'category'")
        elif key == 'enclosures':
            # Virtual key: every rel=enclosure link, with 'rel' stripped.
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
        elif key == 'license':
            # Virtual key: href of the first rel=license link.  Falls
            # through to the default lookup (KeyError) when none matches.
            for link in dict.__getitem__(self, 'links'):
                if link['rel']==u'license' and 'href' in link:
                    return link['href']
        elif key == 'updated':
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            if not dict.__contains__(self, 'updated') and \
                dict.__contains__(self, 'published'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated` to `published` if `updated` doesn't "
                    "exist. This fallback will be removed in a future version "
                    "of feedparser.", DeprecationWarning)
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
        elif key == 'updated_parsed':
            if not dict.__contains__(self, 'updated_parsed') and \
                dict.__contains__(self, 'published_parsed'):
                warnings.warn("To avoid breaking existing software while "
                    "fixing issue 310, a temporary mapping has been created "
                    "from `updated_parsed` to `published_parsed` if "
                    "`updated_parsed` doesn't exist. This fallback will be "
                    "removed in a future version of feedparser.",
                    DeprecationWarning)
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
        else:
            # Translate legacy names; a list means "first key that exists".
            realkey = self.keymap.get(key, key)
            if isinstance(realkey, list):
                for k in realkey:
                    if dict.__contains__(self, k):
                        return dict.__getitem__(self, k)
            elif dict.__contains__(self, realkey):
                return dict.__getitem__(self, realkey)
        # Fall back to the literal key (raises KeyError when absent).
        return dict.__getitem__(self, key)

    def __contains__(self, key):
        if key in ('updated', 'updated_parsed'):
            # Temporarily help developers out by keeping the old
            # broken behavior that was reported in issue 310.
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        else:
            return True

    has_key = __contains__

    def get(self, key, default=None):
        '''
        :return: A :class:`FeedParserDict`.
        '''
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        # Writes go through the alias table so legacy names land on the
        # modern key (first element when the mapping is a list).
        key = self.keymap.get(key, key)
        if isinstance(key, list):
            key = key[0]
        return dict.__setitem__(self, key, value)

    def setdefault(self, key, value):
        if key not in self:
            self[key] = value
            return value
        return self[key]

    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __hash__(self):
        # Identity hash: instances are mutable, so value hashing is unsafe.
        return id(self)
404
# Windows-1252 mapping for the 0x80-0x9F byte range, where cp1252 places
# printable punctuation and letters but iso-8859-1 has C1 control codes.
# NOTE(review): the consumers of this table are outside this chunk —
# presumably it repairs feeds mislabelled as latin-1; verify against the
# decoding code before relying on that description.
_cp1252 = {
    128: unichr(8364), # euro sign
    130: unichr(8218), # single low-9 quotation mark
    131: unichr( 402), # latin small letter f with hook
    132: unichr(8222), # double low-9 quotation mark
    133: unichr(8230), # horizontal ellipsis
    134: unichr(8224), # dagger
    135: unichr(8225), # double dagger
    136: unichr( 710), # modifier letter circumflex accent
    137: unichr(8240), # per mille sign
    138: unichr( 352), # latin capital letter s with caron
    139: unichr(8249), # single left-pointing angle quotation mark
    140: unichr( 338), # latin capital ligature oe
    142: unichr( 381), # latin capital letter z with caron
    145: unichr(8216), # left single quotation mark
    146: unichr(8217), # right single quotation mark
    147: unichr(8220), # left double quotation mark
    148: unichr(8221), # right double quotation mark
    149: unichr(8226), # bullet
    150: unichr(8211), # en dash
    151: unichr(8212), # em dash
    152: unichr( 732), # small tilde
    153: unichr(8482), # trade mark sign
    154: unichr( 353), # latin small letter s with caron
    155: unichr(8250), # single right-pointing angle quotation mark
    156: unichr( 339), # latin small ligature oe
    158: unichr( 382), # latin small letter z with caron
    159: unichr( 376), # latin capital letter y with diaeresis
}
434
# Matches a URI scheme plus '://' (group 1), any extra slashes that follow
# it (group 2, dropped by _urljoin), and the remainder (group 3).
# NOTE(review): inside the character class, '+-.' is a *range* that also
# matches ',' — likely intended as literal '+', '-', '.'; confirm before
# changing, since tightening it would alter which URIs get "fixed".
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    # Join *uri* against *base*, tolerating malformed input: extra slashes
    # directly after the scheme are collapsed, byte strings are decoded as
    # UTF-8 (undecodable bytes ignored), and urljoin failures yield u''.
    uri = _urifixer.sub(r'\1\3', uri)
    if not isinstance(uri, unicode):
        uri = uri.decode('utf-8', 'ignore')
    try:
        uri = urlparse.urljoin(base, uri)
    except ValueError:
        # urlparse.urljoin can raise on pathological input (e.g. bad ports)
        uri = u''
    if not isinstance(uri, unicode):
        return uri.decode('utf-8', 'ignore')
    return uri
447
class _FeedParserMixin:
    # Maps every namespace URI feedparser understands to the prefix used
    # when composing handler method names (e.g. the 'dc' prefix yields
    # lookups like '_start_dc_title').  URIs mapped to '' are treated as
    # the core feed vocabulary (RSS/Atom element names without a prefix).
    namespaces = {
        '': '',
        'http://backend.userland.com/rss': '',
        'http://blogs.law.harvard.edu/tech/rss': '',
        'http://purl.org/rss/1.0/': '',
        'http://my.netscape.com/rdf/simple/0.9/': '',
        'http://example.com/newformat#': '',
        'http://example.com/necho': '',
        'http://purl.org/echo/': '',
        'uri/of/echo/namespace#': '',
        'http://purl.org/pie/': '',
        'http://purl.org/atom/ns#': '',
        'http://www.w3.org/2005/Atom': '',
        'http://purl.org/rss/1.0/modules/rss091#': '',

        'http://webns.net/mvcb/':                                'admin',
        'http://purl.org/rss/1.0/modules/aggregation/':          'ag',
        'http://purl.org/rss/1.0/modules/annotate/':             'annotate',
        'http://media.tangent.org/rss/1.0/':                     'audio',
        'http://backend.userland.com/blogChannelModule':         'blogChannel',
        'http://web.resource.org/cc/':                           'cc',
        'http://backend.userland.com/creativeCommonsRssModule':  'creativeCommons',
        'http://purl.org/rss/1.0/modules/company':               'co',
        'http://purl.org/rss/1.0/modules/content/':              'content',
        'http://my.theinfo.org/changed/1.0/rss/':                'cp',
        'http://purl.org/dc/elements/1.1/':                      'dc',
        'http://purl.org/dc/terms/':                             'dcterms',
        'http://purl.org/rss/1.0/modules/email/':                'email',
        'http://purl.org/rss/1.0/modules/event/':                'ev',
        'http://rssnamespace.org/feedburner/ext/1.0':            'feedburner',
        'http://freshmeat.net/rss/fm/':                          'fm',
        'http://xmlns.com/foaf/0.1/':                            'foaf',
        'http://www.w3.org/2003/01/geo/wgs84_pos#':              'geo',
        'http://www.georss.org/georss':                          'georss',
        'http://www.opengis.net/gml':                            'gml',
        'http://postneo.com/icbm/':                              'icbm',
        'http://purl.org/rss/1.0/modules/image/':                'image',
        'http://www.itunes.com/DTDs/PodCast-1.0.dtd':            'itunes',
        'http://example.com/DTDs/PodCast-1.0.dtd':               'itunes',
        'http://purl.org/rss/1.0/modules/link/':                 'l',
        'http://search.yahoo.com/mrss':                          'media',
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        'http://search.yahoo.com/mrss/':                         'media',
        'http://madskills.com/public/xml/rss/module/pingback/':  'pingback',
        'http://prismstandard.org/namespaces/1.2/basic/':        'prism',
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#':           'rdf',
        'http://www.w3.org/2000/01/rdf-schema#':                 'rdfs',
        'http://purl.org/rss/1.0/modules/reference/':            'ref',
        'http://purl.org/rss/1.0/modules/richequiv/':            'reqv',
        'http://purl.org/rss/1.0/modules/search/':               'search',
        'http://purl.org/rss/1.0/modules/slash/':                'slash',
        'http://schemas.xmlsoap.org/soap/envelope/':             'soap',
        'http://purl.org/rss/1.0/modules/servicestatus/':        'ss',
        'http://hacks.benhammersley.com/rss/streaming/':         'str',
        'http://purl.org/rss/1.0/modules/subscription/':         'sub',
        'http://purl.org/rss/1.0/modules/syndication/':          'sy',
        'http://schemas.pocketsoap.com/rss/myDescModule/':       'szf',
        'http://purl.org/rss/1.0/modules/taxonomy/':             'taxo',
        'http://purl.org/rss/1.0/modules/threading/':            'thr',
        'http://purl.org/rss/1.0/modules/textinput/':            'ti',
        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
        'http://wellformedweb.org/commentAPI/':                  'wfw',
        'http://purl.org/rss/1.0/modules/wiki/':                 'wiki',
        'http://www.w3.org/1999/xhtml':                          'xhtml',
        'http://www.w3.org/1999/xlink':                          'xlink',
        'http://www.w3.org/XML/1998/namespace':                  'xml',
        'http://podlove.org/simple-chapters':                    'psc',
    }
    # Class-level cache: lowercased-URI version of `namespaces`, filled
    # lazily the first time an instance is created (see __init__).
    _matchnamespaces = {}

    # Elements whose character data may itself be a relative URI.
    can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
    # Elements whose (HTML) content may embed relative URIs.
    can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    # Elements whose content may carry markup that needs sanitizing.
    can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
    # MIME types treated as HTML-ish rather than plain text.
    html_types = [u'text/html', u'application/xhtml+xml']
523
524    def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
525        if not self._matchnamespaces:
526            for k, v in self.namespaces.items():
527                self._matchnamespaces[k.lower()] = v
528        self.feeddata = FeedParserDict() # feed-level data
529        self.encoding = encoding # character encoding
530        self.entries = [] # list of entry-level data
531        self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
532        self.namespacesInUse = {} # dictionary of namespaces defined by the feed
533
534        # the following are used internally to track state;
535        # this is really out of control and should be refactored
536        self.infeed = 0
537        self.inentry = 0
538        self.incontent = 0
539        self.intextinput = 0
540        self.inimage = 0
541        self.inauthor = 0
542        self.incontributor = 0
543        self.inpublisher = 0
544        self.insource = 0
545
546        # georss
547        self.ingeometry = 0
548
549        self.sourcedata = FeedParserDict()
550        self.contentparams = FeedParserDict()
551        self._summaryKey = None
552        self.namespacemap = {}
553        self.elementstack = []
554        self.basestack = []
555        self.langstack = []
556        self.baseuri = baseuri or u''
557        self.lang = baselang or None
558        self.svgOK = 0
559        self.title_depth = -1
560        self.depth = 0
561        # psc_chapters_flag prevents multiple psc_chapters from being
562        # captured in a single entry or item. The transition states are
563        # None -> True -> False. psc_chapter elements will only be
564        # captured while it is True.
565        self.psc_chapters_flag = None
566        if baselang:
567            self.feeddata['language'] = baselang.replace('_','-')
568
569        # A map of the following form:
570        #     {
571        #         object_that_value_is_set_on: {
572        #             property_name: depth_of_node_property_was_extracted_from,
573        #             other_property: depth_of_node_property_was_extracted_from,
574        #         },
575        #     }
576        self.property_depth_map = {}
577
578    def _normalize_attributes(self, kv):
579        k = kv[0].lower()
580        v = k in ('rel', 'type') and kv[1].lower() or kv[1]
581        # the sgml parser doesn't handle entities in attributes, nor
582        # does it pass the attribute values through as unicode, while
583        # strict xml parsers do -- account for this difference
584        if isinstance(self, _LooseFeedParser):
585            v = v.replace('&amp;', '&')
586            if not isinstance(v, unicode):
587                v = v.decode('utf-8')
588        return (k, v)
589
    def unknown_starttag(self, tag, attrs):
        """Process a start tag.

        Updates depth, xml:base / xml:lang scope and namespace tracking,
        passes inline XHTML content through as text, then dispatches to a
        ``_start_<prefix><name>`` method when one is defined, falling back
        to storing the element generically.  *attrs* is a list of
        (name, value) pairs.
        """
        # increment depth counter
        self.depth += 1

        # normalize attrs
        attrs = map(self._normalize_attributes, attrs)

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if not isinstance(baseuri, unicode):
            baseuri = baseuri.decode(self.encoding, 'ignore')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            # only the root-level language becomes the feed's language
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces declared on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                # re-declare MathML/SVG namespaces so the captured inline
                # markup stays self-contained
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg':
                self.svgOK += 1
            # inline content is re-serialized as literal text, not parsed
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
677
    def unknown_endtag(self, tag):
        """Process an end tag.

        Dispatches to a ``_end_<prefix><name>`` method when one is defined
        (otherwise pops the element), closes out inline XHTML content, and
        restores the enclosing xml:base / xml:lang scope.
        """
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK:
            self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            # while inside inline SVG, bypass specialized handlers and fall
            # through to the generic pop below
            if self.svgOK:
                raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ('xhtml:div', 'div'):
                return # typepad does this 10/2007
            self.contentparams['type'] = u'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
            # re-serialize the closing tag as literal text (strip any prefix)
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

        self.depth -= 1
721
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        """Append the character named by a numeric reference to the open
        element's text buffer; XML special characters are kept escaped."""
        if not self.elementstack:
            return
        ref = ref.lower()
        # references to the XML special characters (" & ' < >) are passed
        # through unexpanded so embedded markup stays well-formed
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            # 'x' prefix marks a hexadecimal reference
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
736
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        """Append the expansion of a named entity to the open element's
        text buffer; unknown entities are preserved verbatim."""
        if not self.elementstack:
            return
        # the XML special entities stay escaped (cf. handle_charref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                # NOTE(review): recursing with a '&#...;' string looks like it
                # was meant for handle_charref; the recursive call will hit the
                # KeyError branch below — confirm against upstream intent
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                # not a known HTML entity; keep the raw reference
                text = '&%s;' % ref
            else:
                text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
755
756    def handle_data(self, text, escape=1):
757        # called for each block of plain text, i.e. outside of any tag and
758        # not containing any character or entity references
759        if not self.elementstack:
760            return
761        if escape and self.contentparams.get('type') == u'application/xhtml+xml':
762            text = _xmlescape(text)
763        self.elementstack[-1][2].append(text)
764
    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        # comments carry no feed data, so they are discarded
        pass
768
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # processing instructions carry no feed data, so they are discarded
        pass
772
    def handle_decl(self, text):
        # called for a <!...> declaration (e.g. DOCTYPE); nothing to record
        pass
775
776    def parse_declaration(self, i):
777        # override internal declaration handler to handle CDATA blocks
778        if self.rawdata[i:i+9] == '<![CDATA[':
779            k = self.rawdata.find(']]>', i)
780            if k == -1:
781                # CDATA block began but didn't finish
782                k = len(self.rawdata)
783                return k
784            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
785            return k+3
786        else:
787            k = self.rawdata.find('>', i)
788            if k >= 0:
789                return k+1
790            else:
791                # We have an incomplete CDATA block.
792                return k
793
794    def mapContentType(self, contentType):
795        contentType = contentType.lower()
796        if contentType == 'text' or contentType == 'plain':
797            contentType = u'text/plain'
798        elif contentType == 'html':
799            contentType = u'text/html'
800        elif contentType == 'xhtml':
801            contentType = u'application/xhtml+xml'
802        return contentType
803
804    def trackNamespace(self, prefix, uri):
805        loweruri = uri.lower()
806        if not self.version:
807            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
808                self.version = u'rss090'
809            elif loweruri == 'http://purl.org/rss/1.0/':
810                self.version = u'rss10'
811            elif loweruri == 'http://www.w3.org/2005/atom':
812                self.version = u'atom10'
813        if loweruri.find(u'backend.userland.com/rss') <> -1:
814            # match any backend.userland.com namespace
815            uri = u'http://backend.userland.com/rss'
816            loweruri = uri
817        if loweruri in self._matchnamespaces:
818            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
819            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
820        else:
821            self.namespacesInUse[prefix or ''] = uri
822
823    def resolveURI(self, uri):
824        return _urljoin(self.baseuri or u'', uri)
825
826    def decodeEntities(self, element, data):
827        return data
828
829    def strattrs(self, attrs):
830        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
831
    def push(self, element, expectingText):
        """Open element: start accumulating its character-data pieces."""
        self.elementstack.append([element, expectingText, []])
834
    def pop(self, element, stripWhitespace=1):
        """Close element: join its accumulated text pieces, run the
        post-processing pipeline (base64 decode, relative-URI
        resolution, entity decoding, HTML sniffing, sanitizing,
        encoding fixups), store the result in the feed/entry dict,
        and return the processed text.

        The steps below are order-sensitive; do not reorder them.
        """
        if not self.elementstack:
            return
        # ignore a close for anything but the innermost open element
        if self.elementstack[-1][0] != element:
            return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0:
                            break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    # the loop ran to completion: the outer <div> wraps
                    # everything, so strip it
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, unicode):
                pieces[i] = v.decode('utf-8')

        output = u''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText:
            return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            # do not resolve guid elements with isPermalink="false"
            if not element == 'id' or self.guidislink:
                output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # some feed formats require consumers to guess
        # whether the content is html or plain text
        if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
            if self.lookslikehtml(output):
                self.contentparams['type'] = u'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))

        if self.encoding and not isinstance(output, unicode):
            output = output.decode(self.encoding, 'ignore')

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
            try:
                output = output.encode('iso-8859-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                pass

        # map win-1252 extensions to the proper code points
        if isinstance(output, unicode):
            output = output.translate(_cp1252)

        # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords
        if element in ('category', 'tags', 'itunes_keywords'):
            return output

        if element == 'title' and -1 < self.title_depth <= self.depth:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = output.replace('&amp;', '&')
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                # only overwrite a previous value if this element is at the
                # same depth or shallower (prefer outer elements)
                old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
                if old_value_depth is None or self.depth <= old_value_depth:
                    self.property_depth_map[self.entries[-1]][element] = self.depth
                    self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
990
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        """Open a content-bearing element: populate self.contentparams
        (type, language, base, base64 flag) and start buffering text."""
        self.incontent += 1
        if self.lang:
            self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        # note: _isBase64 inspects self.contentparams['type'], so it must
        # run after the assignment above
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
1001
    def popContent(self, tag):
        """Close an element opened by pushContent() and return its text,
        resetting the content-tracking state."""
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
1007
1008    # a number of elements in a number of RSS variants are nominally plain
1009    # text, but this is routinely ignored.  This is an attempt to detect
1010    # the most common cases.  As false positives often result in silent
1011    # data loss, this function errs on the conservative side.
1012    @staticmethod
1013    def lookslikehtml(s):
1014        # must have a close tag or an entity reference to qualify
1015        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
1016            return
1017
1018        # all tags must be in a restricted subset of valid HTML tags
1019        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
1020            re.findall(r'</?(\w+)',s)):
1021            return
1022
1023        # all entities must have been defined as valid HTML entities
1024        if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
1025            return
1026
1027        return 1
1028
1029    def _mapToStandardPrefix(self, name):
1030        colonpos = name.find(':')
1031        if colonpos <> -1:
1032            prefix = name[:colonpos]
1033            suffix = name[colonpos+1:]
1034            prefix = self.namespacemap.get(prefix, prefix)
1035            name = prefix + ':' + suffix
1036        return name
1037
    def _getAttribute(self, attrsD, name):
        """Look up name in attrsD after normalizing its namespace prefix."""
        return attrsD.get(self._mapToStandardPrefix(name))
1040
1041    def _isBase64(self, attrsD, contentparams):
1042        if attrsD.get('mode', '') == 'base64':
1043            return 1
1044        if self.contentparams['type'].startswith(u'text/'):
1045            return 0
1046        if self.contentparams['type'].endswith(u'+xml'):
1047            return 0
1048        if self.contentparams['type'].endswith(u'/xml'):
1049            return 0
1050        return 1
1051
1052    def _itsAnHrefDamnIt(self, attrsD):
1053        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1054        if href:
1055            try:
1056                del attrsD['url']
1057            except KeyError:
1058                pass
1059            try:
1060                del attrsD['uri']
1061            except KeyError:
1062                pass
1063            attrsD['href'] = href
1064        return attrsD
1065
1066    def _save(self, key, value, overwrite=False):
1067        context = self._getContext()
1068        if overwrite:
1069            context[key] = value
1070        else:
1071            context.setdefault(key, value)
1072
1073    def _start_rss(self, attrsD):
1074        versionmap = {'0.91': u'rss091u',
1075                      '0.92': u'rss092',
1076                      '0.93': u'rss093',
1077                      '0.94': u'rss094'}
1078        #If we're here then this is an RSS feed.
1079        #If we don't have a version or have a version that starts with something
1080        #other than RSS then there's been a mistake. Correct it.
1081        if not self.version or not self.version.startswith(u'rss'):
1082            attr_version = attrsD.get('version', '')
1083            version = versionmap.get(attr_version)
1084            if version:
1085                self.version = version
1086            elif attr_version.startswith('2.'):
1087                self.version = u'rss20'
1088            else:
1089                self.version = u'rss'
1090
    def _start_channel(self, attrsD):
        """Enter an RSS/CDF <channel>; CDF attributes are mapped to
        regular elements by _cdf_common."""
        self.infeed = 1
        self._cdf_common(attrsD)
1094
    def _cdf_common(self, attrsD):
        """Map CDF 'lastmod' and 'href' attributes onto the modified and
        link handlers by faking the matching start/data/end events."""
        if 'lastmod' in attrsD:
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if 'href' in attrsD:
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()
1104
1105    def _start_feed(self, attrsD):
1106        self.infeed = 1
1107        versionmap = {'0.1': u'atom01',
1108                      '0.2': u'atom02',
1109                      '0.3': u'atom03'}
1110        if not self.version:
1111            attr_version = attrsD.get('version')
1112            version = versionmap.get(attr_version)
1113            if version:
1114                self.version = version
1115            else:
1116                self.version = u'atom'
1117
    def _end_channel(self):
        """Leave the feed-level scope (shared by <channel> and <feed>)."""
        self.infeed = 0
    _end_feed = _end_channel
1121
    def _start_image(self, attrsD):
        """Enter an <image> element; feed-level images get their own dict."""
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        # reset title tracking so the image's <title> is captured
        self.title_depth = -1
        self.push('image', 0)
1129
    def _end_image(self):
        """Leave an <image> element."""
        self.pop('image')
        self.inimage = 0
1133
    def _start_textinput(self, attrsD):
        """Enter a <textinput>/<textInput> element."""
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.title_depth = -1
        self.push('textinput', 0)
    _start_textInput = _start_textinput
1141
    def _end_textinput(self):
        """Leave a <textinput>/<textInput> element."""
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
1146
    def _start_author(self, attrsD):
        """Enter an author-ish element (author/managingEditor/dc:creator/...)."""
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author
1158
    def _end_author(self):
        """Leave an author-ish element and reconcile author/author_detail."""
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author
1167
    def _start_itunes_owner(self, attrsD):
        """Enter an <itunes:owner> element (stored as 'publisher')."""
        self.inpublisher = 1
        self.push('publisher', 0)
1171
    def _end_itunes_owner(self):
        """Leave an <itunes:owner> element and sync publisher details."""
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
1176
    def _start_contributor(self, attrsD):
        """Enter a <contributor> element; each one gets its own dict."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)
1183
    def _end_contributor(self):
        """Leave a <contributor> element."""
        self.pop('contributor')
        self.incontributor = 0
1187
    def _start_dc_contributor(self, attrsD):
        """Enter a <dc:contributor>; its text is the contributor's name."""
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)
1194
    def _end_dc_contributor(self):
        """Leave a <dc:contributor>, routing its text through _end_name."""
        self._end_name()
        self.incontributor = 0
1198
    def _start_name(self, attrsD):
        """Enter a <name> (or <itunes:name>) element."""
        self.push('name', 0)
    _start_itunes_name = _start_name
1202
    def _end_name(self):
        """Route a <name> value to whichever person/textinput scope is open."""
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
1215
    def _start_width(self, attrsD):
        """Enter a <width> element (image dimension)."""
        self.push('width', 0)
1218
1219    def _end_width(self):
1220        value = self.pop('width')
1221        try:
1222            value = int(value)
1223        except ValueError:
1224            value = 0
1225        if self.inimage:
1226            context = self._getContext()
1227            context['width'] = value
1228
    def _start_height(self, attrsD):
        """Enter a <height> element (image dimension)."""
        self.push('height', 0)
1231
1232    def _end_height(self):
1233        value = self.pop('height')
1234        try:
1235            value = int(value)
1236        except ValueError:
1237            value = 0
1238        if self.inimage:
1239            context = self._getContext()
1240            context['height'] = value
1241
    def _start_url(self, attrsD):
        """Enter a <url>/<homePage>/<uri> element; stored as 'href'."""
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url
1246
    def _end_url(self):
        """Attach an href value to the open author or contributor, if any."""
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url
1255
    def _start_email(self, attrsD):
        """Enter an <email> (or <itunes:email>) element."""
        self.push('email', 0)
    _start_itunes_email = _start_email
1259
    def _end_email(self):
        """Route an email value to the open publisher/author/contributor."""
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
1269
1270    def _getContext(self):
1271        if self.insource:
1272            context = self.sourcedata
1273        elif self.inimage and 'image' in self.feeddata:
1274            context = self.feeddata['image']
1275        elif self.intextinput:
1276            context = self.feeddata['textinput']
1277        elif self.inentry:
1278            context = self.entries[-1]
1279        else:
1280            context = self.feeddata
1281        return context
1282
    def _save_author(self, key, value, prefix='author'):
        """Store a person field both in '<prefix>_detail' and in the
        last entry of the plural '<prefix>s' list, then resync."""
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value
1290
    def _save_contributor(self, key, value):
        """Store a field on the most recently opened contributor dict."""
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
1295
1296    def _sync_author_detail(self, key='author'):
1297        context = self._getContext()
1298        detail = context.get('%ss' % key, [FeedParserDict()])[-1]
1299        if detail:
1300            name = detail.get('name')
1301            email = detail.get('email')
1302            if name and email:
1303                context[key] = u'%s (%s)' % (name, email)
1304            elif name:
1305                context[key] = name
1306            elif email:
1307                context[key] = email
1308        else:
1309            author, email = context.get(key), None
1310            if not author:
1311                return
1312            emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1313            if emailmatch:
1314                email = emailmatch.group(0)
1315                # probably a better way to do the following, but it passes all the tests
1316                author = author.replace(email, u'')
1317                author = author.replace(u'()', u'')
1318                author = author.replace(u'<>', u'')
1319                author = author.replace(u'&lt;&gt;', u'')
1320                author = author.strip()
1321                if author and (author[0] == u'('):
1322                    author = author[1:]
1323                if author and (author[-1] == u')'):
1324                    author = author[:-1]
1325                author = author.strip()
1326            if author or email:
1327                context.setdefault('%s_detail' % key, detail)
1328            if author:
1329                detail['name'] = author
1330            if email:
1331                detail['email'] = email
1332
    def _start_subtitle(self, attrsD):
        """Enter a subtitle-ish element (subtitle/tagline/itunes:subtitle)."""
        self.pushContent('subtitle', attrsD, u'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle
1337
    def _end_subtitle(self):
        """Leave a subtitle-ish element."""
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle
1342
    def _start_rights(self, attrsD):
        """Enter a rights-ish element (rights/dc:rights/copyright)."""
        self.pushContent('rights', attrsD, u'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights
1347
    def _end_rights(self):
        """Leave a rights-ish element."""
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
1352
1353    def _start_item(self, attrsD):
1354        self.entries.append(FeedParserDict())
1355        self.push('item', 0)
1356        self.inentry = 1
1357        self.guidislink = 0
1358        self.title_depth = -1
1359        self.psc_chapters_flag = None
1360        id = self._getAttribute(attrsD, 'rdf:about')
1361        if id:
1362            context = self._getContext()
1363            context['id'] = id
1364        self._cdf_common(attrsD)
1365    _start_entry = _start_item
1366
    def _end_item(self):
        """Leave an entry (<item>/<entry>)."""
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
1371
    def _start_dc_language(self, attrsD):
        """Enter a <dc:language>/<language> element."""
        self.push('language', 1)
    _start_language = _start_dc_language
1375
    def _end_dc_language(self):
        """Record the feed language as the current xml:lang fallback."""
        self.lang = self.pop('language')
    _end_language = _end_dc_language
1379
    def _start_dc_publisher(self, attrsD):
        """Enter a <dc:publisher>/<webMaster> element."""
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher
1383
    def _end_dc_publisher(self):
        """Leave a publisher element and sync publisher details."""
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
1388
    def _start_dcterms_valid(self, attrsD):
        """Enter a <dcterms:valid> element."""
        self.push('validity', 1)
1391
1392    def _end_dcterms_valid(self):
1393        for validity_detail in self.pop('validity').split(';'):
1394            if '=' in validity_detail:
1395                key, value = validity_detail.split('=', 1)
1396                if key == 'start':
1397                    self._save('validity_start', value, overwrite=True)
1398                    self._save('validity_start_parsed', _parse_date(value), overwrite=True)
1399                elif key == 'end':
1400                    self._save('validity_end', value, overwrite=True)
1401                    self._save('validity_end_parsed', _parse_date(value), overwrite=True)
1402
    def _start_published(self, attrsD):
        """Enter a publication-date element (published/issued/pubDate)."""
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published
    _start_pubdate = _start_published
1408
    def _end_published(self):
        """Store the raw publication date plus its parsed form."""
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published
    _end_pubdate = _end_published
1415
    def _start_updated(self, attrsD):
        """Enter an update-date element (updated/modified/dc:date/...)."""
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated
1422
    def _end_updated(self):
        """Store the raw update date plus its parsed form."""
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated
1431
    def _start_created(self, attrsD):
        """Enter a creation-date element (created/dcterms:created)."""
        self.push('created', 1)
    _start_dcterms_created = _start_created
1435
    def _end_created(self):
        """Store the raw creation date plus its parsed form."""
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created
1440
    def _start_expirationdate(self, attrsD):
        """Enter an <expirationDate> element."""
        self.push('expired', 1)
1443
    def _end_expirationdate(self):
        """Store the parsed expiration date."""
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1446
1447    # geospatial location, or "where", from georss.org
1448
    def _start_georssgeom(self, attrsD):
        """Begin a georss geometry element; prepare an empty 'where' dict."""
        self.push('geometry', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()

    _start_georss_point = _start_georssgeom
    _start_georss_line = _start_georssgeom
    _start_georss_polygon = _start_georssgeom
    _start_georss_box = _start_georssgeom
1458
    def _save_where(self, geometry):
        """Merge a parsed geometry dict into the current 'where' value."""
        context = self._getContext()
        context['where'].update(geometry)
1462
    def _end_georss_point(self):
        """Parse a <georss:point> coordinate string into geometry."""
        geometry = _parse_georss_point(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1467
    def _end_georss_line(self):
        """Parse a <georss:line> coordinate string into geometry."""
        geometry = _parse_georss_line(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1472
    def _end_georss_polygon(self):
        """Parse a <georss:polygon> coordinate string into geometry."""
        this = self.pop('geometry')
        geometry = _parse_georss_polygon(this)
        if geometry:
            self._save_where(geometry)
1478
    def _end_georss_box(self):
        """Parse a <georss:box> coordinate string into geometry."""
        geometry = _parse_georss_box(self.pop('geometry'))
        if geometry:
            self._save_where(geometry)
1483
    def _start_where(self, attrsD):
        """Begin a <where>/<georss:where> container; reset 'where'."""
        self.push('where', 0)
        context = self._getContext()
        context['where'] = FeedParserDict()
    _start_georss_where = _start_where
1489
    def _parse_srs_attrs(self, attrsD):
        """Record GML spatial-reference attributes (srsName and
        srsDimension, defaulting to 2) on the current 'where' dict."""
        srsName = attrsD.get('srsname')
        try:
            srsDimension = int(attrsD.get('srsdimension', '2'))
        except ValueError:
            srsDimension = 2
        context = self._getContext()
        context['where']['srsName'] = srsName
        context['where']['srsDimension'] = srsDimension
1499
    def _start_gml_point(self, attrsD):
        # gml:Point: record SRS info, flag that we're inside a geometry,
        # and open a text buffer for the coordinates.
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 1
        self.push('geometry', 0)
1504
    def _start_gml_linestring(self, attrsD):
        # gml:LineString: record SRS info and tag the geometry kind for
        # _parse_poslist later.
        self._parse_srs_attrs(attrsD)
        self.ingeometry = 'linestring'
        self.push('geometry', 0)
1509
    def _start_gml_polygon(self, attrsD):
        # gml:Polygon: record SRS info; the ring itself sets ingeometry.
        self._parse_srs_attrs(attrsD)
        self.push('geometry', 0)
1513
    def _start_gml_exterior(self, attrsD):
        # gml:exterior: just open a geometry buffer for the enclosed ring.
        self.push('geometry', 0)
1516
    def _start_gml_linearring(self, attrsD):
        # gml:LinearRing: coordinates form a polygon ring.
        self.ingeometry = 'polygon'
        self.push('geometry', 0)
1520
    def _start_gml_pos(self, attrsD):
        # gml:pos: buffer the single coordinate pair's text.
        self.push('pos', 0)
1523
1524    def _end_gml_pos(self):
1525        this = self.pop('pos')
1526        context = self._getContext()
1527        srsName = context['where'].get('srsName')
1528        srsDimension = context['where'].get('srsDimension', 2)
1529        swap = True
1530        if srsName and "EPSG" in srsName:
1531            epsg = int(srsName.split(":")[-1])
1532            swap = bool(epsg in _geogCS)
1533        geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
1534        if geometry:
1535            self._save_where(geometry)
1536
    def _start_gml_poslist(self, attrsD):
        # gml:posList: buffer the whitespace-separated coordinate list.
        self.push('pos', 0)
1539
1540    def _end_gml_poslist(self):
1541        this = self.pop('pos')
1542        context = self._getContext()
1543        srsName = context['where'].get('srsName')
1544        srsDimension = context['where'].get('srsDimension', 2)
1545        swap = True
1546        if srsName and "EPSG" in srsName:
1547            epsg = int(srsName.split(":")[-1])
1548            swap = bool(epsg in _geogCS)
1549        geometry = _parse_poslist(
1550            this, self.ingeometry, swap=swap, dims=srsDimension)
1551        if geometry:
1552            self._save_where(geometry)
1553
    def _end_geom(self):
        # Shared close handler for GML geometry elements: clear the
        # in-geometry flag and discard the text buffer.
        self.ingeometry = 0
        self.pop('geometry')
    _end_gml_point = _end_geom
    _end_gml_linestring = _end_geom
    _end_gml_linearring = _end_geom
    _end_gml_exterior = _end_geom
    _end_gml_polygon = _end_geom
1562
    def _end_where(self):
        # Close a (georss:)where element; the 'where' dict stays in context.
        self.pop('where')
    _end_georss_where = _end_where
1566
1567    # end geospatial
1568
1569    def _start_cc_license(self, attrsD):
1570        context = self._getContext()
1571        value = self._getAttribute(attrsD, 'rdf:resource')
1572        attrsD = FeedParserDict()
1573        attrsD['rel'] = u'license'
1574        if value:
1575            attrsD['href']=value
1576        context.setdefault('links', []).append(attrsD)
1577
    def _start_creativecommons_license(self, attrsD):
        # creativeCommons:license carries its URL as element text; buffer it.
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license
1581
1582    def _end_creativecommons_license(self):
1583        value = self.pop('license')
1584        context = self._getContext()
1585        attrsD = FeedParserDict()
1586        attrsD['rel'] = u'license'
1587        if value:
1588            attrsD['href'] = value
1589        context.setdefault('links', []).append(attrsD)
1590        del context['license']
1591    _end_creativeCommons_license = _end_creativecommons_license
1592
1593    def _addTag(self, term, scheme, label):
1594        context = self._getContext()
1595        tags = context.setdefault('tags', [])
1596        if (not term) and (not scheme) and (not label):
1597            return
1598        value = FeedParserDict(term=term, scheme=scheme, label=label)
1599        if value not in tags:
1600            tags.append(value)
1601
    def _start_tags(self, attrsD):
        # This is a completely-made up element. Its semantics are determined
        # only by a single feed that precipitated bug report 392 on Google Code.
        # In short, this is junk code.
        self.push('tags', 1)
1607
1608    def _end_tags(self):
1609        for term in self.pop('tags').split(','):
1610            self._addTag(term.strip(), None, None)
1611
1612    def _start_category(self, attrsD):
1613        term = attrsD.get('term')
1614        scheme = attrsD.get('scheme', attrsD.get('domain'))
1615        label = attrsD.get('label')
1616        self._addTag(term, scheme, label)
1617        self.push('category', 1)
1618    _start_dc_subject = _start_category
1619    _start_keywords = _start_category
1620
    def _start_media_category(self, attrsD):
        # media:category defaults to Yahoo's Media RSS category scheme.
        attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)
1624
1625    def _end_itunes_keywords(self):
1626        for term in self.pop('itunes_keywords').split(','):
1627            if term.strip():
1628                self._addTag(term.strip(), u'http://www.itunes.com/', None)
1629
1630    def _end_media_keywords(self):
1631        for term in self.pop('media_keywords').split(','):
1632            if term.strip():
1633                self._addTag(term.strip(), None, None)
1634
    def _start_itunes_category(self, attrsD):
        # itunes:category keeps its value in the 'text' attribute.
        self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
        self.push('category', 1)
1638
1639    def _end_category(self):
1640        value = self.pop('category')
1641        if not value:
1642            return
1643        context = self._getContext()
1644        tags = context['tags']
1645        if value and len(tags) and not tags[-1]['term']:
1646            tags[-1]['term'] = value
1647        else:
1648            self._addTag(value, None, None)
1649    _end_dc_subject = _end_category
1650    _end_keywords = _end_category
1651    _end_itunes_category = _end_category
1652    _end_media_category = _end_category
1653
    def _start_cloud(self, attrsD):
        # RSS <cloud>: store its attributes verbatim on the context.
        self._getContext()['cloud'] = FeedParserDict(attrsD)
1656
    def _start_link(self, attrsD):
        # Default rel/type, resolve the href, register the link on the
        # current context, and buffer element text only when no href exists.
        attrsD.setdefault('rel', u'alternate')
        if attrsD['rel'] == u'self':
            attrsD.setdefault('type', u'application/atom+xml')
        else:
            attrsD.setdefault('type', u'text/html')
        context = self._getContext()
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if 'href' in attrsD:
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context.setdefault('links', [])
        if not (self.inentry and self.inimage):
            context['links'].append(FeedParserDict(attrsD))
        if 'href' in attrsD:
            expectingText = 0
            # an alternate link with an HTML-ish type becomes the main 'link'
            if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
1677
1678    def _end_link(self):
1679        value = self.pop('link')
1680
    def _start_guid(self, attrsD):
        # guid doubles as a link unless isPermaLink="false" is given.
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)
    _start_id = _start_guid
1685
1686    def _end_guid(self):
1687        value = self.pop('id')
1688        self._save('guidislink', self.guidislink and 'link' not in self._getContext())
1689        if self.guidislink:
1690            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1691            # and only if the item doesn't already have a link element
1692            self._save('link', value)
1693    _end_id = _end_guid
1694
1695    def _start_title(self, attrsD):
1696        if self.svgOK:
1697            return self.unknown_starttag('title', attrsD.items())
1698        self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1699    _start_dc_title = _start_title
1700    _start_media_title = _start_title
1701
1702    def _end_title(self):
1703        if self.svgOK:
1704            return
1705        value = self.popContent('title')
1706        if not value:
1707            return
1708        self.title_depth = self.depth
1709    _end_dc_title = _end_title
1710
    def _end_media_title(self):
        # Handled like a title, but media:title must not clobber title_depth.
        title_depth = self.title_depth
        self._end_title()
        self.title_depth = title_depth
1715
1716    def _start_description(self, attrsD):
1717        context = self._getContext()
1718        if 'summary' in context:
1719            self._summaryKey = 'content'
1720            self._start_content(attrsD)
1721        else:
1722            self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
1723    _start_dc_description = _start_description
1724    _start_media_description = _start_description
1725
    def _start_abstract(self, attrsD):
        # <abstract> is treated as a plain-text description.
        self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1728
1729    def _end_description(self):
1730        if self._summaryKey == 'content':
1731            self._end_content()
1732        else:
1733            value = self.popContent('description')
1734        self._summaryKey = None
1735    _end_abstract = _end_description
1736    _end_dc_description = _end_description
1737    _end_media_description = _end_description
1738
    def _start_info(self, attrsD):
        # Atom 0.3 <info>: buffer its text as plain-text content.
        self.pushContent('info', attrsD, u'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info
1742
    def _end_info(self):
        # Close an <info> element; popContent stores the text itself.
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
1746
    def _start_generator(self, attrsD):
        # Record the generator's attributes (with any href resolved) and
        # buffer the element text for the generator's name.
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if 'href' in attrsD:
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)
1754
1755    def _end_generator(self):
1756        value = self.pop('generator')
1757        context = self._getContext()
1758        if 'generator_detail' in context:
1759            context['generator_detail']['name'] = value
1760
    def _start_admin_generatoragent(self, attrsD):
        # admin:generatorAgent keeps its value in rdf:resource; funnel it
        # through push/pop so 'generator' is recorded like element text.
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1768
    def _start_admin_errorreportsto(self, attrsD):
        # admin:errorReportsTo keeps its value in rdf:resource; funnel it
        # through push/pop so it is recorded like element text.
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
1775
1776    def _start_summary(self, attrsD):
1777        context = self._getContext()
1778        if 'summary' in context:
1779            self._summaryKey = 'content'
1780            self._start_content(attrsD)
1781        else:
1782            self._summaryKey = 'summary'
1783            self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
1784    _start_itunes_summary = _start_summary
1785
    def _end_summary(self):
        # Close a summary (or the content it was redirected to) and reset
        # the redirection marker.
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
1793
1794    def _start_enclosure(self, attrsD):
1795        attrsD = self._itsAnHrefDamnIt(attrsD)
1796        context = self._getContext()
1797        attrsD['rel'] = u'enclosure'
1798        context.setdefault('links', []).append(FeedParserDict(attrsD))
1799
    def _start_source(self, attrsD):
        if 'url' in attrsD:
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        # reset so a <title> inside <source> is tracked independently
        self.title_depth = -1
1807
1808    def _end_source(self):
1809        self.insource = 0
1810        value = self.pop('source')
1811        if value:
1812            self.sourcedata['title'] = value
1813        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1814        self.sourcedata.clear()
1815
    def _start_content(self, attrsD):
        # Atom <content>: buffer as plain text by default; remember any
        # src attribute (out-of-line content).
        self.pushContent('content', attrsD, u'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)
1822
    def _start_body(self, attrsD):
        # <body> (CDF) is treated as inline XHTML content.
        self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body
1826
    def _start_content_encoded(self, attrsD):
        # content:encoded (RSS) is escaped HTML content.
        self.pushContent('content', attrsD, u'text/html', 1)
    _start_fullitem = _start_content_encoded
1830
1831    def _end_content(self):
1832        copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
1833        value = self.popContent('content')
1834        if copyToSummary:
1835            self._save('summary', value)
1836
1837    _end_body = _end_content
1838    _end_xhtml_body = _end_content
1839    _end_content_encoded = _end_content
1840    _end_fullitem = _end_content
1841
1842    def _start_itunes_image(self, attrsD):
1843        self.push('itunes_image', 0)
1844        if attrsD.get('href'):
1845            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1846        elif attrsD.get('url'):
1847            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
1848    _start_itunes_link = _start_itunes_image
1849
1850    def _end_itunes_block(self):
1851        value = self.pop('itunes_block', 0)
1852        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1853
1854    def _end_itunes_explicit(self):
1855        value = self.pop('itunes_explicit', 0)
1856        # Convert 'yes' -> True, 'clean' to False, and any other value to None
1857        # False and None both evaluate as False, so the difference can be ignored
1858        # by applications that only need to know if the content is explicit.
1859        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
1860
    def _start_media_group(self, attrsD):
        # don't do anything, but don't break the enclosed tags either
        pass
1864
    def _start_media_rating(self, attrsD):
        # media:rating: keep the first rating's attributes, buffer the text.
        context = self._getContext()
        context.setdefault('media_rating', attrsD)
        self.push('rating', 1)
1869
1870    def _end_media_rating(self):
1871        rating = self.pop('rating')
1872        if rating is not None and rating.strip():
1873            context = self._getContext()
1874            context['media_rating']['content'] = rating
1875
    def _start_media_credit(self, attrsD):
        # media:credit: append a new credit entry, buffer its text.
        context = self._getContext()
        context.setdefault('media_credit', [])
        context['media_credit'].append(attrsD)
        self.push('credit', 1)
1881
1882    def _end_media_credit(self):
1883        credit = self.pop('credit')
1884        if credit != None and len(credit.strip()) != 0:
1885            context = self._getContext()
1886            context['media_credit'][-1]['content'] = credit
1887
    def _start_media_restriction(self, attrsD):
        # media:restriction: keep the first restriction's attributes,
        # buffer the text.
        context = self._getContext()
        context.setdefault('media_restriction', attrsD)
        self.push('restriction', 1)
1892
1893    def _end_media_restriction(self):
1894        restriction = self.pop('restriction')
1895        if restriction != None and len(restriction.strip()) != 0:
1896            context = self._getContext()
1897            context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]
1898
    def _start_media_license(self, attrsD):
        # media:license: keep the first license's attributes, buffer the text.
        context = self._getContext()
        context.setdefault('media_license', attrsD)
        self.push('license', 1)
1903
1904    def _end_media_license(self):
1905        license = self.pop('license')
1906        if license != None and len(license.strip()) != 0:
1907            context = self._getContext()
1908            context['media_license']['content'] = license
1909
    def _start_media_content(self, attrsD):
        # media:content: append this element's attributes to the list.
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)
1914
    def _start_media_thumbnail(self, attrsD):
        # media:thumbnail: append the attributes and also buffer element
        # text in case the url comes as text instead of an attribute.
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)
1920
1921    def _end_media_thumbnail(self):
1922        url = self.pop('url')
1923        context = self._getContext()
1924        if url != None and len(url.strip()) != 0:
1925            if 'url' not in context['media_thumbnail'][-1]:
1926                context['media_thumbnail'][-1]['url'] = url
1927
    def _start_media_player(self, attrsD):
        # media:player: store the attributes, buffer the element text.
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)
1931
1932    def _end_media_player(self):
1933        value = self.pop('media_player')
1934        context = self._getContext()
1935        context['media_player']['content'] = value
1936
    def _start_newlocation(self, attrsD):
        # RSS <newLocation>: buffer the replacement feed URL.
        self.push('newlocation', 1)
1939
    def _end_newlocation(self):
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1947
    def _start_psc_chapters(self, attrsD):
        # Only the first psc:chapters element is honored; the flag moves
        # None -> True -> False over the element's lifecycle.
        if self.psc_chapters_flag is None:
            # Transition from None -> True
            self.psc_chapters_flag = True
            attrsD['chapters'] = []
            self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
1954
    def _end_psc_chapters(self):
        # Transition from True -> False
        self.psc_chapters_flag = False
1958
    def _start_psc_chapter(self, attrsD):
        # Record chapters only while inside the first psc:chapters element.
        if self.psc_chapters_flag:
            start = self._getAttribute(attrsD, 'start')
            attrsD['start_parsed'] = _parse_psc_chapter_start(start)

            context = self._getContext()['psc_chapters']
            context['chapters'].append(FeedParserDict(attrsD))
1966
1967
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX ContentHandler that drives _FeedParserMixin for well-formed XML.

        Records parse errors in `self.bozo`/`self.exc` instead of aborting
        (except for fatal errors, which are re-raised).
        """

        def __init__(self, baseuri, baselang, encoding):
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None
            self.decls = {}

        def startPrefixMapping(self, prefix, uri):
            if not uri:
                return
            # Jython uses '' instead of None; standardize on None
            prefix = prefix or None
            self.trackNamespace(prefix, uri)
            if prefix and uri == 'http://www.w3.org/1999/xlink':
                self.decls['xmlns:' + prefix] = uri

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            # `!=` replaces the Python-2-only `<>` operator (same semantics)
            if lowernamespace.find(u'backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = u'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
                # call-style raise works on both Python 2 and Python 3
                raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
            localname = str(localname).lower()

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all).  Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD, self.decls = self.decls, {}
            if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                attrsD['xmlns']=namespace
            if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
                attrsD['xmlns']=namespace

            if prefix:
                localname = prefix.lower() + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break

            for (namespace, attrlocalname), attrvalue in attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            localname = str(localname).lower()
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            elif namespace and not qname: #Expat
                for name,value in self.namespacesInUse.items():
                    if name and value == namespace:
                        localname = name + ':' + localname
                        break
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            # record the parse error and mark the feed as bozo
            self.bozo = 1
            self.exc = exc

        # drv_libxml2 calls warning() in some cases
        warning = error

        def fatalError(self, exc):
            self.error(exc)
            raise exc
2065
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML parser that reconstructs the markup it parses.

    Subclasses override the handle_*/unknown_* callbacks to filter or
    rewrite markup; output() returns the reassembled document.
    """

    # characters needing special handling inside attribute values
    special = re.compile('''[<>'"]''')
    # an '&' that is not already part of a character/entity reference
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    # HTML void elements: serialized as <tag /> with no closing tag
    elements_no_end_tag = set([
      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
      'source', 'track', 'wbr'
    ])

    def __init__(self, encoding, _type):
        # encoding: character encoding of the input; _type: its MIME type
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # accumulated output fragments, joined together by output()
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        # regex callback: expand <tag/> to <tag></tag>, except void elements
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        pass
    goahead.func_code = sgmllib.SGMLParser.goahead.func_code

    def __parse_starttag(self, i):
        pass
    __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code

    def parse_starttag(self,i):
        # run sgmllib's start-tag parser, then emit the matching end tag
        # ourselves for XHTML self-closing tags
        j = self.__parse_starttag(i)
        if self._type == 'application/xhtml+xml':
            if j>2 and self.rawdata[j-2:j]=='/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        # pre-process the markup so sgmllib can cope with it, then parse
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        try:
            bytes
            if bytes is str:
                raise NameError
            # on Python 3, poison the encoding so no further encode happens
            self.encoding = self.encoding + u'_INVALID_PYTHON_3'
        except NameError:
            if self.encoding and isinstance(data, unicode):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        if not attrs:
            return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        uattrs = []
        strattrs=''
        if attrs:
            for key, value in attrs:
                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if not isinstance(value, unicode):
                    value = value.decode(self.encoding, 'ignore')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except (UnicodeEncodeError, LookupError):
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%s%s />' % (tag, strattrs))
        else:
            self.pieces.append('<%s%s>' % (tag, strattrs))

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%s>" % tag)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        ref = ref.lower()
        if ref.startswith('x'):
            value = int(ref[1:], 16)
        else:
            value = int(ref)

        if value in _cp1252:
            # re-map cp1252-only code points to their Unicode equivalents
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%s;' % ref)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            # unknown entity: escape the ampersand so output stays well-formed
            self.pieces.append('&amp;%s' % ref)

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%s-->' % text)

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%s>' % text)

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%s>' % text)

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # more lenient replacement for sgmllib's declaration-name scanner
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        # keep character references unexpanded in the output
        return '&#%s;' % name

    def convert_entityref(self, name):
        # keep entity references unexpanded in the output
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

    def parse_declaration(self, i):
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('&lt;')
            return i+1
2251
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """sgmllib-based (loose) parser driver, used when strict XML parsing fails."""

    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities

    def decodeEntities(self, element, data):
        # Normalize numeric references for the XML special characters to
        # their named-entity equivalents.
        for numeric, named in (
                ('&#60;', '&lt;'), ('&#x3c;', '&lt;'), ('&#x3C;', '&lt;'),
                ('&#62;', '&gt;'), ('&#x3e;', '&gt;'), ('&#x3E;', '&gt;'),
                ('&#38;', '&amp;'), ('&#x26;', '&amp;'),
                ('&#34;', '&quot;'), ('&#x22;', '&quot;'),
                ('&#39;', '&apos;'), ('&#x27;', '&apos;')):
            data = data.replace(numeric, named)
        if not self.contentparams.get('type', u'xml').endswith(u'xml'):
            # non-XML content: fully decode the special characters
            for named, char in (
                    ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&'),
                    ('&quot;', '"'), ('&apos;', "'"),
                    ('&#x2f;', '/'), ('&#x2F;', '/')):
                data = data.replace(named, char)
        return data

    def strattrs(self, attrs):
        # serialize (name, value) pairs back into attribute syntax
        pieces = [' %s="%s"' % (name, value.replace('"', '&quot;'))
                  for name, value in attrs]
        return ''.join(pieces)
2284
class _RelativeURIResolver(_BaseHTMLProcessor):
    """HTML filter that rewrites known URI-bearing attributes so that
    relative references are resolved against a base URI."""

    # (element, attribute) pairs whose values may hold a relative URI.
    relative_uris = set([
        ('a', 'href'), ('applet', 'codebase'), ('area', 'href'),
        ('audio', 'src'), ('blockquote', 'cite'), ('body', 'background'),
        ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'),
        ('frame', 'src'), ('iframe', 'longdesc'), ('iframe', 'src'),
        ('head', 'profile'), ('img', 'longdesc'), ('img', 'src'),
        ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'),
        ('ins', 'cite'), ('link', 'href'), ('object', 'classid'),
        ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'),
        ('q', 'cite'), ('script', 'src'), ('source', 'src'),
        ('video', 'poster'), ('video', 'src')])

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Strip surrounding whitespace before joining against the base.
        return _makeSafeAbsoluteURI(self.baseuri, uri.strip())

    def unknown_starttag(self, tag, attrs):
        rewritten = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # NOTE: mirrors the original `and/or` expression -- when
                # resolution yields an empty (unsafe) URI, the raw value
                # is kept as-is.
                value = self.resolveURI(value) or value
            rewritten.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, rewritten)
2327
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Rewrite relative URI references in htmlSource against baseURI.

    Returns the source untouched when sgmllib is unavailable."""
    if not _SGML_AVAILABLE:
        return htmlSource

    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
2335
def _makeSafeAbsoluteURI(base, rel=None):
    """Resolve `rel` against `base` and vet the result's URI scheme.

    Returns u'' whenever the resulting URI carries a scheme outside
    ACCEPTABLE_URI_SCHEMES.  An empty ACCEPTABLE_URI_SCHEMES set disables
    all scheme checking.
    """
    # With no whitelist configured, every scheme is acceptable.
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        # No relative part supplied: vet the base URI on its own.
        try:
            scheme = urlparse.urlparse(base)[0]
        except ValueError:
            return u''
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return u''
    joined = _urljoin(base, rel)
    scheme = joined.strip().split(':', 1)[0]
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return joined
2354
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist-based HTML sanitizer.

    Re-emits only the elements, attributes, and CSS listed in the
    acceptance sets below; everything else is dropped.  Inline MathML and
    SVG islands are allowed when they declare the appropriate xmlns.
    """

    # HTML elements passed through unchanged.
    acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
        'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
        'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
        'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
        'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
        'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
        'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
        'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
        'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
        'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])

    # HTML attributes kept on acceptable elements.
    acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
      'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
      'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
      'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
      'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
      'width', 'wrap', 'xml:lang'])

    # Elements whose entire content (not just the tags) is discarded.
    unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])

    # CSS properties kept by sanitize_style().
    acceptable_css_properties = set(['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width'])

    # survey of common keywords found in feeds
    acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow'])

    # Matches simple CSS values: hex colors, rgb() triples, and short
    # numeric lengths/percentages.
    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    # MathML elements allowed inside a declared MathML island.
    mathml_elements = set([
        'annotation',
        'annotation-xml',
        'maction',
        'maligngroup',
        'malignmark',
        'math',
        'menclose',
        'merror',
        'mfenced',
        'mfrac',
        'mglyph',
        'mi',
        'mlabeledtr',
        'mlongdiv',
        'mmultiscripts',
        'mn',
        'mo',
        'mover',
        'mpadded',
        'mphantom',
        'mprescripts',
        'mroot',
        'mrow',
        'ms',
        'mscarries',
        'mscarry',
        'msgroup',
        'msline',
        'mspace',
        'msqrt',
        'msrow',
        'mstack',
        'mstyle',
        'msub',
        'msubsup',
        'msup',
        'mtable',
        'mtd',
        'mtext',
        'mtr',
        'munder',
        'munderover',
        'none',
        'semantics',
    ])

    # Attributes allowed on MathML elements.
    mathml_attributes = set([
        'accent',
        'accentunder',
        'actiontype',
        'align',
        'alignmentscope',
        'altimg',
        'altimg-height',
        'altimg-valign',
        'altimg-width',
        'alttext',
        'bevelled',
        'charalign',
        'close',
        'columnalign',
        'columnlines',
        'columnspacing',
        'columnspan',
        'columnwidth',
        'crossout',
        'decimalpoint',
        'denomalign',
        'depth',
        'dir',
        'display',
        'displaystyle',
        'edge',
        'encoding',
        'equalcolumns',
        'equalrows',
        'fence',
        'fontstyle',
        'fontweight',
        'form',
        'frame',
        'framespacing',
        'groupalign',
        'height',
        'href',
        'id',
        'indentalign',
        'indentalignfirst',
        'indentalignlast',
        'indentshift',
        'indentshiftfirst',
        'indentshiftlast',
        'indenttarget',
        'infixlinebreakstyle',
        'largeop',
        'length',
        'linebreak',
        'linebreakmultchar',
        'linebreakstyle',
        'lineleading',
        'linethickness',
        'location',
        'longdivstyle',
        'lquote',
        'lspace',
        'mathbackground',
        'mathcolor',
        'mathsize',
        'mathvariant',
        'maxsize',
        'minlabelspacing',
        'minsize',
        'movablelimits',
        'notation',
        'numalign',
        'open',
        'other',
        'overflow',
        'position',
        'rowalign',
        'rowlines',
        'rowspacing',
        'rowspan',
        'rquote',
        'rspace',
        'scriptlevel',
        'scriptminsize',
        'scriptsizemultiplier',
        'selection',
        'separator',
        'separators',
        'shift',
        'side',
        'src',
        'stackalign',
        'stretchy',
        'subscriptshift',
        'superscriptshift',
        'symmetric',
        'voffset',
        'width',
        'xlink:href',
        'xlink:show',
        'xlink:type',
        'xmlns',
        'xmlns:xlink',
    ])

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use'])

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
       'arabic-form', 'ascent', 'attributeName', 'attributeType',
       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
       'overline-position', 'overline-thickness', 'panose-1', 'path',
       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
       'stop-color', 'stop-opacity', 'strikethrough-position',
       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
       'y2', 'zoomAndPan'])

    # Lazily-built lowercase->camelCase maps for SVG (see unknown_starttag).
    svg_attr_map = None
    svg_elem_map = None

    # CSS properties allowed inside SVG islands.
    acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity'])

    def reset(self):
        # Reset inherited parser state, then the sanitizer's own counters.
        _BaseHTMLProcessor.reset(self)
        # depth of open elements whose entire content is being discarded
        self.unacceptablestack = 0
        # nesting depth inside declared MathML / SVG islands
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        # Decide whether to keep this start tag, and with which attributes.
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag=='svg':
                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                    if tag=='math':
                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if  self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower=[attr.lower() for attr in self.svg_attributes]
                    mix=[a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(),a) for a in mix])

                    lower=[attr.lower() for attr in self.svg_elements]
                    mix=[a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(),a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag,tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        # NOTE: the tuple-parameter lambda below is Python 2-only syntax.
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key=keymap.get(key,key)
                # make sure the uri uses an acceptable uri scheme
                if key == u'href':
                    value = _makeSafeAbsoluteURI(value)
                clean_attrs.append((key,value))
            elif key=='style':
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key,clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        # Emit the end tag only if the element was acceptable; track exits
        # from discarded content and from MathML/SVG islands.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag,tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # Processing instructions are dropped entirely.
        pass

    def handle_decl(self, text):
        # Declarations (e.g. DOCTYPE) are dropped entirely.
        pass

    def handle_data(self, text):
        # Suppress character data inside discarded elements.
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        # Filter a style attribute down to whitelisted CSS.
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
            return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                # shorthand properties: every keyword must be whitelisted
                # or match the simple-value pattern
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

    def parse_comment(self, i, report=1):
        # Parse an HTML comment at offset i, defending against malformed
        # or truncated comments.
        ret = _BaseHTMLProcessor.parse_comment(self, i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
2745
2746
def _sanitizeHTML(htmlSource, encoding, _type):
    """Strip unacceptable markup from htmlSource and return the cleaned,
    whitespace-trimmed result.

    Returns the source untouched when sgmllib is unavailable."""
    if not _SGML_AVAILABLE:
        return htmlSource
    sanitizer = _HTMLSanitizer(encoding, _type)
    # Neutralize CDATA openers so their contents are escaped, not parsed.
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    sanitizer.feed(htmlSource)
    cleaned = sanitizer.output()
    return cleaned.strip().replace('\r\n', '\n')
2756
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that records HTTP status codes on the response
    instead of raising, follows redirects while noting the final URL, and
    upgrades failed basic auth to digest auth when the server asks."""

    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        # Follow the redirect, but record the status code and the URL we
        # were redirected to so callers can see them.
        result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
                                                            code, msg, hdrs)
        result.status = code
        result.newurl = result.geturl()
        return result
    # The default implementations in urllib2.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        if base64 is None or 'Authorization' not in req.headers \
                          or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        auth = _base64decode(req.headers['Authorization'].split(' ')[1])
        # Split on the first colon only: RFC 7617 forbids colons in the
        # user-id but allows them in the password, so a plain split(':')
        # would raise ValueError for such passwords.
        user, passw = auth.split(':', 1)
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry
2799
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.

    :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`.
    """

    # already a file-like object: hand it back untouched
    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if isinstance(url_file_stream_or_string, basestring) \
       and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # Test for inline user:password credentials for HTTP basic auth
        auth = None
        if base64 and not url_file_stream_or_string.startswith('ftp:'):
            # strip user:password from the URL and carry it as a
            # base64-encoded Authorization value instead
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        if isinstance(url_file_stream_or_string, unicode):
            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD
            # NOTE(review): close() releases the opener's handlers; the
            # returned response object appears to stay readable -- confirm
            # against the urllib2 implementation in use.

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except (IOError, UnicodeEncodeError, TypeError):
        # if url_file_stream_or_string is a unicode object that
        # cannot be converted to the encoding returned by
        # sys.getfilesystemencoding(), a UnicodeEncodeError
        # will be thrown
        # If url_file_stream_or_string is a string that contains NULL
        # (such as an XML document encoded in UTF-32), TypeError will
        # be thrown.
        pass

    # treat url_file_stream_or_string as string
    if isinstance(url_file_stream_or_string, unicode):
        return _StringIO(url_file_stream_or_string.encode('utf-8'))
    return _StringIO(url_file_stream_or_string)
2887
def _convert_to_idn(url):
    """Convert a URL to IDN notation"""
    # this function should only be called with a unicode string
    # strategy: if the host cannot be encoded in ascii, then
    # it'll be necessary to encode it in idn form
    parts = list(urlparse.urlsplit(url))
    try:
        parts[1].encode('ascii')
    except UnicodeEncodeError:
        # split off an explicit port, if any, before encoding the host
        pieces = parts[1].rsplit(':', 1)
        port = u''
        if len(pieces) == 2:
            port = pieces.pop()
        # encode each dotted label of the hostname separately
        labels = [label.encode('idna').decode('utf-8')
                  for label in pieces[0].split('.')]
        parts[1] = '.'.join(labels)
        if port:
            parts[1] = parts[1] + ':' + port
        return urlparse.urlunsplit(parts)
    # host was pure ASCII: nothing to do
    return url
2911
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    """Assemble a urllib2.Request carrying conditional-GET, auth,
    referrer, compression, and caller-supplied headers."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    # normalize `modified` to a 9-tuple time struct
    if isinstance(modified, basestring):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        stamp = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
            short_weekdays[modified[6]], modified[2],
            months[modified[1] - 1], modified[0],
            modified[3], modified[4], modified[5])
        request.add_header('If-Modified-Since', stamp)
    if referrer:
        request.add_header('Referer', referrer)
    # advertise only the decompression schemes we can actually handle
    if gzip and zlib:
        request.add_header('Accept-encoding', 'gzip, deflate')
    elif gzip:
        request.add_header('Accept-encoding', 'gzip')
    elif zlib:
        request.add_header('Accept-encoding', 'deflate')
    else:
        request.add_header('Accept-encoding', '')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
2949
2950def _parse_psc_chapter_start(start):
2951    FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
2952
2953    m = re.compile(FORMAT).match(start)
2954    if m is None:
2955        return None
2956
2957    _, h, m, s, _, ms = m.groups()
2958    h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
2959    return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
2960
2961_date_handlers = []
2962def registerDateHandler(func):
2963    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2964    _date_handlers.insert(0, func)
2965
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# Expand each template's placeholders (YYYY, MM, DD, OOO = ordinal day,
# CC = century) into named regex groups, and append an optional
# time-of-day/timezone suffix to every one of them.
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
# In Python 2 list-comprehension loop variables leak into the enclosing
# scope; delete them so they don't pollute the module namespace. Python 3
# doesn't leak them, hence the NameError guard.
try:
    del tmpl
except NameError:
    pass
# Pre-compile every expanded pattern; only the bound `match` methods are
# kept since that's all _parse_date_iso8601 needs.
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
3003
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    # Try each pre-compiled template pattern in order; the template list
    # is ordered so the greediest (most specific) pattern wins.
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m:
            break
    if not m:
        return
    if m.span() == (0, 0):
        # The empty template ('') matches anything with zero width;
        # treat a zero-width match as "no match".
        return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        # No year given: default to the current (GMT) year.
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params:
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    # `ordinal` goes in the tm_yday slot (index 7) of the 9-tuple.
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments: shift hours/minutes so the result
    # is expressed in UTC before normalization.
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
3082
# 8-bit date handling routines written by ytrewq1.
# Korean date-word constants used to recognize OnBlog and Nate feed dates.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# Matches e.g. '2004년 05월 28일 01:31:46' (year/month/day markers in Korean).
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# Matches e.g. '2004-05-25 오전 11:23:17' (12-hour clock with Korean AM/PM).
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    match = _korean_onblog_date_re.match(dateString)
    if not match:
        return
    year, month, day, hour, minute, second = match.groups()
    fields = {'year': year, 'month': month, 'day': day,
              'hour': hour, 'minute': minute, 'second': second,
              'zonediff': '+09:00'}
    # Rebuild as a W3DTF timestamp (KST is fixed at UTC+9) and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
3107
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    match = _korean_nate_date_re.match(dateString)
    if not match:
        return
    # Nate uses a 12-hour clock with Korean AM/PM markers.
    hour = int(match.group(5))
    if match.group(4) == _korean_pm:
        hour += 12
    fields = {'year': match.group(1), 'month': match.group(2),
              'day': match.group(3), 'hour': '%02d' % hour,
              'minute': match.group(6), 'second': match.group(7),
              'zonediff': '+09:00'}
    # Rebuild as a W3DTF timestamp (KST is fixed at UTC+9) and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
3126
# Unicode strings for Greek date strings
# Maps Greek month abbreviations (several spelling variants) to English
# RFC 822 month abbreviations.
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

# Maps Greek weekday abbreviations to English RFC 822 weekday names.
_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

# RFC-822-like layout: 'weekday, DD month YYYY HH:MM:SS zone'.
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
3164
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    match = _greek_date_format_re.match(dateString)
    if not match:
        return
    # A KeyError on an unknown weekday/month name is intentionally left
    # unhandled here; _parse_date() catches it and tries the next handler.
    fields = {
        'wday': _greek_wdays[match.group(1)],
        'day': match.group(2),
        'month': _greek_months[match.group(3)],
        'year': match.group(4),
        'hour': match.group(5),
        'minute': match.group(6),
        'second': match.group(7),
        'zonediff': match.group(8),
    }
    # Translate to an English RFC 822 date and delegate.
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % fields
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
3178
# Unicode strings for Hungarian date strings
# Maps Hungarian month names to two-digit month numbers.
# NOTE(review): u'febru\u00e1ri' and u'm\u00e1ujus' look like misspellings
# of the Hungarian month names (febru\u00e1r / m\u00e1jus) -- kept as-is
# because incoming feeds are matched against these exact keys.
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

# Layout: 'YYYY-monthname-DDTHH:MM+ZZ:ZZ' (day/hour may be one digit).
_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3198
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    match = _hungarian_date_format_re.match(dateString)
    if match is None or match.group(2) not in _hungarian_months:
        return None
    # Zero-pad single-digit day and hour values.
    day = match.group(3)
    day = '0' + day if len(day) == 1 else day
    hour = match.group(4)
    hour = '0' + hour if len(hour) == 1 else hour
    fields = {
        'year': match.group(1),
        'month': _hungarian_months[match.group(2)],
        'day': day,
        'hour': hour,
        'minute': match.group(5),
        'zonediff': match.group(6),
    }
    # Rebuild as a W3DTF timestamp and delegate.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % fields
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
3217
3218timezonenames = {
3219    'ut': 0, 'gmt': 0, 'z': 0,
3220    'adt': -3, 'ast': -4, 'at': -4,
3221    'edt': -4, 'est': -5, 'et': -5,
3222    'cdt': -5, 'cst': -6, 'ct': -6,
3223    'mdt': -6, 'mst': -7, 'mt': -7,
3224    'pdt': -7, 'pst': -8, 'pt': -8,
3225    'a': -1, 'n': 1,
3226    'm': -12, 'y': 12,
3227}
3228# W3 date and time format parser
3229# http://www.w3.org/TR/NOTE-datetime
3230# Also supports MSSQL-style datetimes as defined at:
3231# http://msdn.microsoft.com/en-us/library/ms186724.aspx
3232# (basically, allow a space as a date/time/timezone separator)
3233def _parse_date_w3dtf(datestr):
3234    if not datestr.strip():
3235        return None
3236    parts = datestr.lower().split('t')
3237    if len(parts) == 1:
3238        # This may be a date only, or may be an MSSQL-style date
3239        parts = parts[0].split()
3240        if len(parts) == 1:
3241            # Treat this as a date only
3242            parts.append('00:00:00z')
3243    elif len(parts) > 2:
3244        return None
3245    date = parts[0].split('-', 2)
3246    if not date or len(date[0]) != 4:
3247        return None
3248    # Ensure that `date` has 3 elements. Using '1' sets the default
3249    # month to January and the default day to the 1st of the month.
3250    date.extend(['1'] * (3 - len(date)))
3251    try:
3252        year, month, day = [int(i) for i in date]
3253    except ValueError:
3254        # `date` may have more than 3 elements or may contain
3255        # non-integer strings.
3256        return None
3257    if parts[1].endswith('z'):
3258        parts[1] = parts[1][:-1]
3259        parts.append('z')
3260    # Append the numeric timezone offset, if any, to parts.
3261    # If this is an MSSQL-style date then parts[2] already contains
3262    # the timezone information, so `append()` will not affect it.
3263    # Add 1 to each value so that if `find()` returns -1 it will be
3264    # treated as False.
3265    loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
3266    loc = loc - 1
3267    parts.append(parts[1][loc:])
3268    parts[1] = parts[1][:loc]
3269    time = parts[1].split(':', 2)
3270    # Ensure that time has 3 elements. Using '0' means that the
3271    # minutes and seconds, if missing, will default to 0.
3272    time.extend(['0'] * (3 - len(time)))
3273    tzhour = 0
3274    tzmin = 0
3275    if parts[2][:1] in ('-', '+'):
3276        try:
3277            tzhour = int(parts[2][1:3])
3278            tzmin = int(parts[2][4:])
3279        except ValueError:
3280            return None
3281        if parts[2].startswith('-'):
3282            tzhour = tzhour * -1
3283            tzmin = tzmin * -1
3284    else:
3285        tzhour = timezonenames.get(parts[2], 0)
3286    try:
3287        hour, minute, second = [int(float(i)) for i in time]
3288    except ValueError:
3289        return None
3290    # Create the datetime object and timezone delta objects
3291    try:
3292        stamp = datetime.datetime(year, month, day, hour, minute, second)
3293    except ValueError:
3294        return None
3295    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3296    # Return the date and timestamp in a UTC 9-tuple
3297    try:
3298        return (stamp - delta).utctimetuple()
3299    except (OverflowError, ValueError):
3300        # IronPython throws ValueErrors instead of OverflowErrors
3301        return None
3302
# Add the W3DTF parser to the chain of handlers tried by _parse_date().
registerDateHandler(_parse_date_w3dtf)
3304
def _parse_date_rfc822(date):
    """Parse RFC 822 dates and times
    http://tools.ietf.org/html/rfc822#section-5

    There are some formatting differences that are accounted for:
    1. Years may be two or four digits.
    2. The month and day can be swapped.
    3. Additional timezone names are supported.
    4. A default time and timezone are assumed if only a date is present.

    Returns a 9-tuple in UTC (datetime.utctimetuple()) or None on failure.
    """
    daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
    months = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }

    parts = date.lower().split()
    if len(parts) < 5:
        # Assume that the time and timezone are missing
        parts.extend(('00:00:00', '0000'))
    # Remove the day name
    if parts[0][:3] in daynames:
        parts = parts[1:]
    if len(parts) < 5:
        # If there are still fewer than five parts, there's not enough
        # information to interpret this
        return None
    try:
        day = int(parts[0])
    except ValueError:
        # Check if the day and month are swapped
        if months.get(parts[0][:3]):
            try:
                day = int(parts[1])
            except ValueError:
                return None
            else:
                # Move the month name into position 1 where it's expected.
                parts[1] = parts[0]
        else:
            return None
    month = months.get(parts[1][:3])
    if not month:
        return None
    try:
        year = int(parts[2])
    except ValueError:
        return None
    # Normalize two-digit years:
    # Anything in the 90's is interpreted as 1990 and on
    # Anything 89 or less is interpreted as 2089 or before
    if len(parts[2]) <= 2:
        year += (1900, 2000)[year < 90]
    # Missing minutes/seconds default to 0.
    timeparts = parts[3].split(':')
    timeparts = timeparts + ([0] * (3 - len(timeparts)))
    try:
        (hour, minute, second) = map(int, timeparts)
    except ValueError:
        return None
    tzhour = 0
    tzmin = 0
    # Strip 'Etc/' from the timezone
    if parts[4].startswith('etc/'):
        parts[4] = parts[4][4:]
    # Normalize timezones that start with 'gmt':
    # GMT-05:00 => -0500
    # GMT => GMT
    if parts[4].startswith('gmt'):
        parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
    # Handle timezones like '-0500', '+0500', and 'EST'
    if parts[4] and parts[4][0] in ('-', '+'):
        try:
            tzhour = int(parts[4][1:3])
            tzmin = int(parts[4][3:])
        except ValueError:
            return None
        if parts[4].startswith('-'):
            tzhour = tzhour * -1
            tzmin = tzmin * -1
    else:
        # Named timezones; unknown names are treated as UTC.
        tzhour = timezonenames.get(parts[4], 0)
    # Create the datetime object and timezone delta objects
    try:
        stamp = datetime.datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None
    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
    # Return the date and timestamp in a UTC 9-tuple
    try:
        return (stamp - delta).utctimetuple()
    except (OverflowError, ValueError):
        # IronPython throws ValueErrors instead of OverflowErrors
        return None
registerDateHandler(_parse_date_rfc822)
3398
# Lowercase English month abbreviations, index = month number - 1.
_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
def _parse_date_asctime(dt):
    """Parse asctime-style dates.

    Converts asctime to RFC822-compatible dates and uses the RFC822 parser
    to do the actual parsing.

    Supported formats (format is standardized to the first one listed):

    * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
    * {weekday name} {month name} dd hh:mm:ss yyyy
    """

    parts = dt.split()

    # No timezone present: insert GMT so both formats unify to six parts.
    if len(parts) == 5:
        parts.insert(4, '+0000')

    # Anything other than six parts is not asctime.
    if len(parts) != 6:
        return None

    # Shuffle into RFC 822 order (wday, dd, month, yyyy, time, tz)
    # and delegate to the RFC 822 parser.
    wday, month, day, timeofday, tz, year = parts
    return _parse_date_rfc822(' '.join([wday, day, month, year, timeofday, tz]))
registerDateHandler(_parse_date_asctime)
3428
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Example: Fri, 2006/09/15 08:19:53 EDT
    pattern = re.compile( \
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    match = pattern.search(aDateString)
    if match is None:
        return None
    dow, year, month, day, hour, minute, second, tz = match.groups()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # Reassemble as an RFC 822 date and lean on the stdlib parser.
    dateString = "%s, %s %s %s %s:%s:%s %s" % (
        dow, day, month_names[int(month) - 1], year, hour, minute, second, tz)
    parsed = rfc822.parsedate_tz(dateString)
    if parsed:
        return time.gmtime(rfc822.mktime_tz(parsed))
registerDateHandler(_parse_date_perforce)
3445
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    if not dateString:
        return None
    for handler in _date_handlers:
        # A handler that chokes on this format simply yields to the next.
        try:
            result = handler(dateString)
        except (KeyError, OverflowError, ValueError):
            continue
        # Accept only a well-formed 9-tuple.
        if result and len(result) == 9:
            return result
    return None
3461
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])

# Two NUL bytes; used to distinguish a UTF-16 BOM from a UTF-32 BOM.
ZERO_BYTES = _l2bytes([0x00, 0x00])

# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
# (Operates on a unicode string, after the document has been decoded.)
RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')

# Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?>
# (Operates on the raw byte string, before decoding.)
RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3479
def convert_to_utf8(http_headers, data):
    '''Detect and convert the character encoding to UTF-8.

    http_headers is a dictionary
    data is a raw string (not Unicode)

    Returns a 3-tuple (data, rfc3023_encoding, error): the document
    re-encoded as UTF-8 bytes (left as-is if no candidate encoding
    worked), the encoding actually used (empty string when unknown),
    and None or an exception instance describing any encoding problem
    that was detected along the way.'''

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified.  But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii.  (We now do this.)  And also that it
    # must always be flagged as non-well-formed.  (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible.  Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not).  iconv_codec can help a lot;
    # you should definitely install it if you can.
    # http://cjkpython.i18n.org/

    bom_encoding = u''
    xml_encoding = u''
    rfc3023_encoding = u''

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = u'utf-32be'
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = u'utf-32le'
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16be'
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = u'utf-16le'
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = u'utf-8'
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = u'cp037'
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = u'utf-16be'
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = u'utf-16le'
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = u'utf-32be'
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = u'utf-32le'

    # Decode just enough of the document (via the BOM-sniffed encoding,
    # if any) that the ASCII-compatible regex below can find the XML
    # declaration's encoding attribute.
    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # feedparser recognizes UTF-32 encodings that aren't
        # available in Python 2.4 and 2.5, so it's possible to
        # encounter a LookupError during decoding.
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # Normalize the xml_encoding if necessary.
        # A BOM already fixed the byte order, so generic UTF-16/32 names
        # in the declaration are replaced with the BOM's specific variant.
        if bom_encoding and (xml_encoding in (
            u'u16', u'utf-16', u'utf16', u'utf_16',
            u'u32', u'utf-32', u'utf32', u'utf_32',
            u'iso-10646-ucs-2', u'iso-10646-ucs-4',
            u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
        )):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get('content-type') or ''
    http_content_type, params = cgi.parse_header(http_content_type)
    http_encoding = params.get('charset', '').replace("'", "")
    if not isinstance(http_encoding, unicode):
        http_encoding = http_encoding.decode('utf-8', 'ignore')

    acceptable_content_type = 0
    application_content_types = (u'application/xml', u'application/xml-dtd',
                                 u'application/xml-external-parsed-entity')
    text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith(u'application/') and
        http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith(u'text/') and
          http_content_type.endswith(u'+xml')):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_content_type.startswith(u'text/'):
        rfc3023_encoding = http_encoding or u'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
        rfc3023_encoding = xml_encoding or u'iso-8859-1'
    else:
        rfc3023_encoding = xml_encoding or u'utf-8'
    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == u'gb2312':
        rfc3023_encoding = u'gb18030'
    if xml_encoding.lower() == u'gb2312':
        xml_encoding = u'gb18030'

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    error = None

    if http_headers and (not acceptable_content_type):
        if 'content-type' in http_headers:
            msg = '%s is not an XML media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = 0
    lazy_chardet_encoding = None
    tried_encodings = []
    if chardet:
        # Deferred so chardet (which scans the whole document) only runs
        # if every earlier candidate encoding fails.
        def lazy_chardet_encoding():
            chardet_encoding = chardet.detect(data)['encoding']
            if not chardet_encoding:
                chardet_encoding = ''
            if not isinstance(chardet_encoding, unicode):
                chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
            return chardet_encoding
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
                              lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
        if callable(proposed_encoding):
            proposed_encoding = proposed_encoding()
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = data.decode(proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = 1
            # Update the encoding in the opening XML processing instruction.
            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
            if RE_XML_DECLARATION.search(data):
                data = RE_XML_DECLARATION.sub(new_declaration, data)
            else:
                data = new_declaration + u'\n' + data
            data = data.encode('utf-8')
            break
    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            'document encoding unknown, I tried ' +
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
            (rfc3023_encoding, xml_encoding))
        rfc3023_encoding = u''
    elif proposed_encoding != rfc3023_encoding:
        # Decoding succeeded, but with an encoding other than the one the
        # headers/declaration promised; report the override to the caller.
        error = CharacterEncodingOverride(
            'document declared as %s, but parsed as %s' %
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

    return data, rfc3023_encoding, error
3690
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)

# Match safe entity declarations.
# This will allow hexadecimal character references through,
# as well as text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
# (Disallowing '&' inside the replacement text, except for numeric
# character references, is what blocks entity-expansion attacks.)
RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3706
def replace_doctype(data):
    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)

    data is a raw byte string containing the XML document.

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document with a replaced DOCTYPE

    Also returns a dict of the "safe" entity declarations (name -> value,
    both as unicode) that were preserved, for use by the loose parser.
    '''

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    start = re.search(_s2bytes(r'<\w'), data)
    start = start and start.start() or -1
    head, data = data[:start+1], data[start+1:]

    # Save and then remove all of the ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if _s2bytes('netscape') in doctype.lower():
        version = u'rss091n'
    else:
        version = None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = _s2bytes('')
    if len(doctype_results) == 1 and entity_results:
        # List comprehension instead of filter()+lambda: same result on
        # Python 2, and the truthiness test below stays correct on
        # Python 3 (where a filter object is always truthy).
        safe_entities = [e for e in entity_results
                         if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
                        + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
                        + _s2bytes('>\n]>')
    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

    # Precompute the safe entities for the loose parser.
    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
                      for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
    return version, data, safe_entities
3747
3748
3749# GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
3750# items, or None in the case of a parsing error.
3751
def _parse_poslist(value, geom_type, swap=True, dims=2):
    # Dispatch a GML pos-list string to the matching geometry parser.
    # Only 'linestring' and 'polygon' are understood; anything else -> None.
    if geom_type == 'polygon':
        # A polygon's exterior ring is parsed as a line, then wrapped.
        ring = _parse_georss_line(value, swap, dims)
        return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
    if geom_type == 'linestring':
        return _parse_georss_line(value, swap, dims)
    return None
3760
def _gen_georss_coords(value, swap=True, dims=2):
    # A generator of (lon, lat) pairs from a string of encoded GeoRSS
    # coordinates. Converts to floats and swaps order.
    #
    # value: whitespace- and/or comma-separated string of numbers
    #     (GeoRSS order is "lat lon"; swap=True yields (lon, lat)).
    # dims: if 3, a third value (presumably elevation -- TODO confirm)
    #     is appended to each tuple.
    #
    # NOTE(review): Python 2 only (itertools.imap, .next). Termination
    # relies on the StopIteration raised by nxt() leaking out and ending
    # the generator -- behavior that PEP 479 turns into a RuntimeError
    # on Python 3.7+.
    latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
    nxt = latlons.next
    while True:
        # [::-1] reverses the (lat, lon) pair when swap is true.
        t = [nxt(), nxt()][::swap and -1 or 1]
        if dims == 3:
            t.append(nxt())
        yield tuple(t)
3771
def _parse_georss_point(value, swap=True, dims=2):
    # A point is a single latitude-longitude pair separated by whitespace
    # (comma separators are tolerated). Returns a GeoJSON-style dict,
    # or None if the value cannot be parsed.
    try:
        points = list(_gen_georss_coords(value, swap, dims))
        first = points[0]
    except (IndexError, ValueError):
        return None
    return {u'type': u'Point', u'coordinates': first}
3780
def _parse_georss_line(value, swap=True, dims=2):
    # A line is a whitespace-separated list of latitude-longitude pairs
    # in the WGS84 coordinate reference system. Returns a GeoJSON-style
    # dict, or None if the value cannot be parsed.
    try:
        points = list(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    return {u'type': u'LineString', u'coordinates': points}
3790
def _parse_georss_polygon(value, swap=True, dims=2):
    # A polygon is a whitespace-separated list of latitude-longitude
    # pairs. A closed ring needs at least four pairs, the last identical
    # to the first (so at least three distinct points). Returns a
    # GeoJSON-style dict, or None on a parse error or short ring.
    try:
        exterior = list(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    if len(exterior) >= 4:
        return {u'type': u'Polygon', u'coordinates': (exterior,)}
    return None
3803
def _parse_georss_box(value, swap=True, dims=2):
    # A bounding box is a rectangular region, often used to define the
    # extents of a map or a rough area of interest. It holds two
    # whitespace-separated latitude-longitude pairs: lower corner first,
    # then upper corner. Returns a GeoJSON-style dict, or None on error.
    try:
        corners = tuple(_gen_georss_coords(value, swap, dims))
    except (IndexError, ValueError):
        return None
    return {u'type': u'Box', u'coordinates': corners}
3814
3815# end geospatial parsers
3816
3817
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.

    etag/modified are forwarded to the HTTP request for conditional GETs;
    agent and referrer set the User-Agent and Referer headers; handlers
    is a urllib2-style handler (or list of handlers); response_headers,
    if given, overrides headers reported by the server.

    :return: A :class:`FeedParserDict`.
    '''

    # Fresh per-call defaults (avoids the mutable-default-argument trap).
    if handlers is None:
        handlers = []
    if request_headers is None:
        request_headers = {}
    if response_headers is None:
        response_headers = {}

    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        # Retrieval failures are reported via bozo/bozo_exception rather
        # than raised; the function still returns a (mostly empty) result.
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # lowercase all of the HTTP headers for comparisons per RFC 2616
    if 'headers' in result:
        http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
    else:
        http_headers = {}

    # if feed is gzip-compressed, decompress it
    if f and data and http_headers:
        if gzip and 'gzip' in http_headers.get('content-encoding', ''):
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except (IOError, struct.error), e:
                # IOError can occur if the gzip header is bad.
                # struct.error can occur if the data is damaged.
                result['bozo'] = 1
                result['bozo_exception'] = e
                if isinstance(e, struct.error):
                    # A gzip header was found but the data is corrupt.
                    # Ideally, we should re-request the feed without the
                    # 'Accept-encoding: gzip' header, but we don't.
                    data = None
        elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
            try:
                data = zlib.decompress(data)
            except zlib.error, e:
                try:
                    # The data may have no headers and no checksum.
                    data = zlib.decompress(data, -15)
                except zlib.error, e:
                    result['bozo'] = 1
                    result['bozo_exception'] = e

    # save HTTP headers
    if http_headers:
        if 'etag' in http_headers:
            etag = http_headers.get('etag', u'')
            if not isinstance(etag, unicode):
                etag = etag.decode('utf-8', 'ignore')
            if etag:
                result['etag'] = etag
        if 'last-modified' in http_headers:
            modified = http_headers.get('last-modified', u'')
            if modified:
                result['modified'] = modified
                result['modified_parsed'] = _parse_date(modified)
    if hasattr(f, 'url'):
        if not isinstance(f.url, unicode):
            result['href'] = f.url.decode('utf-8', 'ignore')
        else:
            result['href'] = f.url
        # Default status; overwritten below if f reports a real one.
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    if data is None:
        return result

    # Stop processing if the server sent HTTP 304 Not Modified.
    if getattr(f, 'code', 0) == 304:
        result['version'] = u''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # Re-encode the document as UTF-8; an empty detected encoding means
    # we cannot trust the bytes enough for the strict (SAX) parser.
    data, result['encoding'], error = convert_to_utf8(http_headers, data)
    use_strict_parser = result['encoding'] and True or False
    if error is not None:
        result['bozo'] = 1
        result['bozo_exception'] = error

    result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = http_headers.get('content-location', u'')
    href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', None)
    if not isinstance(baselang, unicode) and baselang is not None:
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # disable downloading external doctype references, if possible
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException, e:
            # Record the error and fall through to the loose parser.
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
3970
# The list of EPSG codes for geographic (latitude/longitude) coordinate
# systems to support decoding of GeoRSS GML profiles.
# Membership in this list means a gml:pos/posList uses lat-lon axis order
# and should be swapped to (lon, lat) -- presumably checked by the GML
# handlers elsewhere in this file; verify against callers.
_geogCS = [
3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
Note: See TracBrowser for help on using the repository browser.