1 | """ |
---|
2 | Developed by Massimo Di Pierro |
---|
3 | Released under the web2py license (LGPL) |
---|
4 | |
---|
5 | What does it do? |
---|
6 | |
---|
7 | if html is a variable containing HTML text and urls in the text, when you call |
---|
8 | |
---|
9 | html = expand_html(html) |
---|
10 | |
---|
11 | it automatically converts the URLs to links and, when possible, embeds the object being linked. |
---|
12 | In particular it can embed images, videos, audio files, documents (it uses the Google Docs viewer), |
---|
13 | as well as pages from an oEmbed service. |
---|
14 | |
---|
15 | |
---|
16 | Google Doc Support |
---|
17 | ================== |
---|
18 | Microsoft Word (.DOC, .DOCX) |
---|
19 | Microsoft Excel (.XLS and .XLSX) |
---|
20 | Microsoft PowerPoint 2007 / 2010 (.PPTX) |
---|
21 | Apple Pages (.PAGES) |
---|
22 | Adobe PDF (.PDF) |
---|
23 | Adobe Illustrator (.AI) |
---|
24 | Adobe Photoshop (.PSD) |
---|
25 | Autodesk AutoCad (.DXF) |
---|
26 | Scalable Vector Graphics (.SVG) |
---|
27 | PostScript (.EPS, .PS) |
---|
28 | TrueType (.TTF) |
---|
29 | XML Paper Specification (.XPS) |
---|
30 | |
---|
31 | Oembed Support |
---|
32 | ============== |
---|
33 | flickr.com |
---|
34 | youtube.com |
---|
35 | hulu.com |
---|
36 | vimeo.com |
---|
37 | slideshare.net |
---|
38 | qik.com |
---|
39 | polleverywhere.com |
---|
40 | wordpress.com |
---|
41 | revision3.com |
---|
42 | viddler.com |
---|
43 | """ |
---|
44 | from __future__ import print_function |
---|
45 | from gluon._compat import FancyURLopener, urllib_quote |
---|
46 | |
---|
47 | import re |
---|
48 | import cgi |
---|
49 | import sys |
---|
50 | from json import loads |
---|
51 | import urllib |
---|
52 | import uuid |
---|
53 | try: |
---|
54 | from BeautifulSoup import BeautifulSoup, Comment |
---|
55 | have_soup = True |
---|
56 | except ImportError: |
---|
57 | have_soup = False |
---|
58 | |
---|
# Matches any http:// or https:// URL up to the next whitespace character.
# Raw string: "\S" is not a valid string escape and triggers a
# DeprecationWarning/SyntaxWarning when written in a normal literal.
regex_link = re.compile(r'https?://\S+')
---|
60 | |
---|
# Table of (url pattern, oEmbed endpoint) pairs, tried in order.
#
# Patterns are raw strings with literal dots escaped, accept both http and
# https links, and are applied with ``match`` (anchored at the start only).
# The revision3 entry used to be the glob "http://*.revision3.com/", which as
# a regular expression did not match sub-domains such as www.revision3.com.
EMBED_MAPS = [
    (re.compile(r'https?://\S*?flickr\.com/\S*'),
     'http://www.flickr.com/services/oembed/'),
    # optional sub-domain, so both youtube.com and www.youtube.com match
    (re.compile(r'https?://(\S*\.)?youtu(\.be|be\.com)/watch\S*'),
     'http://www.youtube.com/oembed'),
    (re.compile(r'https?://www\.hulu\.com/watch/\S*'),
     'http://www.hulu.com/api/oembed.json'),
    (re.compile(r'https?://vimeo\.com/\S*'),
     'http://vimeo.com/api/oembed.json'),
    (re.compile(r'https?://www\.slideshare\.net/[^/]+/\S*'),
     'http://www.slideshare.net/api/oembed/2'),
    (re.compile(r'https?://qik\.com/\S*'),
     'http://qik.com/api/oembed.json'),
    (re.compile(r'https?://www\.polleverywhere\.com/\w+/\S+'),
     'http://www.polleverywhere.com/services/oembed/'),
    (re.compile(r'https?://\S+\.wordpress\.com/\S+'),
     'http://public-api.wordpress.com/oembed/'),
    (re.compile(r'https?://(\S+\.)?revision3\.com/\S+'),
     'http://revision3.com/api/oembed/'),
    (re.compile(r'https?://\S+\.viddler\.com/\S+'),
     'http://lab.viddler.com/services/oembed/'),
]
---|
83 | |
---|
84 | |
---|
def image(url):
    """Return an ``<img>`` tag showing the image at *url*.

    Double quotes in *url* are written as ``&quot;`` so the value cannot
    close the ``src`` attribute early and inject extra attributes. Other
    characters are left untouched so that URLs which already carry
    entity-encoded ampersands are not encoded a second time.
    """
    return '<img src="%s" style="max-width:100%%"/>' % url.replace('"', '&quot;')
---|
87 | |
---|
88 | |
---|
def audio(url):
    """Return an ``<audio>`` player element whose source is *url*.

    Double quotes in *url* are written as ``&quot;`` so the value cannot
    close the ``src`` attribute early and inject extra attributes. Other
    characters are left untouched so that URLs which already carry
    entity-encoded ampersands are not encoded a second time.
    """
    return ('<audio controls="controls" style="max-width:100%%">'
            '<source src="%s" /></audio>') % url.replace('"', '&quot;')
---|
91 | |
---|
92 | |
---|
def video(url):
    """Return a ``<video>`` player element whose source is *url*.

    Double quotes in *url* are written as ``&quot;`` so the value cannot
    close the ``src`` attribute early and inject extra attributes. Other
    characters are left untouched so that URLs which already carry
    entity-encoded ampersands are not encoded a second time.
    """
    return ('<video controls="controls" style="max-width:100%%">'
            '<source src="%s" /></video>') % url.replace('"', '&quot;')
---|
95 | |
---|
96 | |
---|
def googledoc_viewer(url):
    """Return an ``<iframe>`` that shows *url* through the Google Docs viewer.

    The document URL is percent-quoted with ``urllib_quote`` before being
    placed in the viewer's ``url`` query parameter.
    """
    quoted_url = urllib_quote(url)
    template = '<iframe src="https://docs.google.com/viewer?url=%s&embedded=true" style="max-width:100%%"></iframe>'
    return template % quoted_url
---|
99 | |
---|
100 | |
---|
def web2py_component(url):
    """Return a placeholder ``<div>`` plus a script that loads *url* into it.

    A fresh random UUID is used as the element id; the emitted script calls
    the JavaScript function ``web2py_component(url, id)``.
    """
    element_id = str(uuid.uuid4())
    template = '<div id="%s"></div><script>\nweb2py_component("%s","%s");\n</script>'
    return template % (element_id, url, element_id)
---|
104 | |
---|
# Maps a lower-case file extension (without the dot) to the function that
# renders a URL with that extension as embedded HTML.
EXTENSION_MAPS = {
    # images
    'png': image,
    'gif': image,
    'jpg': image,
    'jpeg': image,
    # audio
    'wav': audio,
    'ogg': audio,
    'mp3': audio,
    # video
    'mov': video,
    'mpe': video,
    'mp4': video,
    'mpg': video,
    'mpg2': video,
    'mpeg': video,
    'mpeg4': video,
    'movie': video,
    'wmv': video,
    # web2py components
    'load': web2py_component,
    # documents rendered through the Google Docs viewer
    'pdf': googledoc_viewer,
    'doc': googledoc_viewer,
    'docx': googledoc_viewer,
    'ppt': googledoc_viewer,
    'pptx': googledoc_viewer,
    'xls': googledoc_viewer,
    'xlsx': googledoc_viewer,
    'pages': googledoc_viewer,
    'ai': googledoc_viewer,
    'psd': googledoc_viewer,
    # AutoCAD drawings are .dxf (as listed in the module docstring); the
    # historical misspelling 'xdf' is kept so existing behaviour is unchanged.
    'dxf': googledoc_viewer,
    'xdf': googledoc_viewer,
    'svg': googledoc_viewer,
    # PostScript formats listed in the module docstring
    'eps': googledoc_viewer,
    'ps': googledoc_viewer,
    'ttf': googledoc_viewer,
    'xps': googledoc_viewer,
}
---|
138 | |
---|
139 | |
---|
class VimeoURLOpener(FancyURLopener):
    """URL opener that advertises a browser-like agent string.

    Vimeo blocks the urllib user agent for some reason, so the ``version``
    attribute is overridden with a different value.
    """
    # presumably used by the opener as its User-Agent value — confirm
    # against the FancyURLopener documentation
    version = "Mozilla/4.0"


# Install an instance as urllib's module-level opener.
# NOTE(review): ``_urlopener`` is a private urllib attribute; presumably it is
# what urllib.urlopen() uses on Python 2 — confirm before porting.
urllib._urlopener = VimeoURLOpener()
---|
144 | |
---|
145 | |
---|
def oembed(url):
    """Fetch oEmbed metadata for *url*.

    Every entry of ``EMBED_MAPS`` whose pattern matches *url* is tried in
    order until one provider returns a JSON object.

    Args:
        url: the page URL to look up.

    Returns:
        The decoded JSON object (a dict) from the first provider that
        answers, or ``{}`` when no pattern matches or every request fails.
    """
    for pattern, endpoint in EMBED_MAPS:
        if not pattern.match(url):
            continue
        # The target URL travels as a query-string value, so it has to be
        # percent-encoded; HTML escaping would leave '&' and '#' active and
        # truncate the value on the provider side.
        request_url = endpoint + '?format=json&url=' + urllib_quote(url)
        try:
            # NOTE(review): urllib.urlopen is the Python 2 API; where it is
            # missing the AttributeError is caught below and {} is returned.
            response = urllib.urlopen(request_url)
            try:
                data = response.read()
            finally:
                response.close()
            result = loads(data)  # json!
        except Exception:
            # Best effort: a failing provider simply means "nothing to embed".
            continue
        if isinstance(result, dict):
            return result
    return {}
---|
156 | |
---|
157 | |
---|
def extension(url):
    """Return the lower-cased file extension of *url*, without the dot.

    Only the last path segment is inspected: the query string, the fragment
    and the ``scheme://host`` part are ignored, so the top-level domain of a
    bare host (``http://example.ai``) or a dot inside a directory name is
    never mistaken for a file extension.

    Returns:
        The extension (e.g. ``'png'``), or ``''`` when the last path segment
        contains no dot.
    """
    path = url.split('?', 1)[0].split('#', 1)[0]
    if '://' in path:
        # drop "scheme://host"; what follows the first '/' is the path
        path = path.split('://', 1)[1].partition('/')[2]
    filename = path.rsplit('/', 1)[-1]
    if '.' not in filename:
        return ''
    return filename.rsplit('.', 1)[-1].lower()
---|
160 | |
---|
161 | |
---|
def _to_native(text):
    """Return *text* as the interpreter's native ``str`` type."""
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        # Python 3 bytes (on Python 2 bytes is str and was returned above)
        return text.decode('utf8')
    # Python 2 unicode, e.g. values decoded by json
    return text.encode('utf8')


def _attr(value):
    """Make *value* safe inside a double-quoted HTML attribute."""
    return value.replace('"', '&quot;')


def expand_one(url, cdict):
    """Convert a single URL (or e-mail address) into an HTML snippet.

    Args:
        url: the matched URL, or a bare e-mail address.
        cdict: optional dict used as a cache of oEmbed responses keyed by
            URL; it is updated in place. May be ``None``.

    Returns:
        A ``mailto:`` link for e-mail addresses; otherwise the provider's
        embed HTML when oEmbed supplies it, an embedded player/viewer when
        the file extension is listed in ``EXTENSION_MAPS``, or a plain link.
    """
    if '@' in url and '://' not in url:
        return '<a href="mailto:%s">%s</a>' % (_attr(url), url)
    # try oembed but first check in cache
    if cdict and url in cdict:
        r = cdict[url]
    else:
        r = oembed(url)
        if isinstance(cdict, dict):
            cdict[url] = r
    # if oembed service
    if 'html' in r:
        # keep a native str on both Python 2 and 3; an unconditional
        # .encode() yields bytes on Python 3 and breaks startswith() below
        html = _to_native(r['html'])
        if html.startswith('<object'):
            return '<embed style="max-width:100%%">%s</embed>' % html
        return html
    elif 'url' in r:
        # the provider points at the real resource (e.g. the image file)
        url = _to_native(r['url'])
    # embed images, video, audio files
    ext = extension(url)
    if ext in EXTENSION_MAPS:
        return EXTENSION_MAPS[ext](url)
    # else regular link
    return '<a href="%(h)s">%(u)s</a>' % dict(h=_attr(url), u=url)
---|
187 | |
---|
188 | |
---|
def expand_html(html, cdict=None):
    """Replace the URLs found in the text of *html* by links or embeds.

    HTML comments are removed. Text that sits directly inside ``a``,
    ``script``, ``pre``, ``code``, ``embed``, ``object``, ``audio`` or
    ``video`` elements is left untouched. Every other text node has each URL
    matched by ``regex_link`` replaced with the output of ``expand_one``.

    Args:
        html: the HTML text to process.
        cdict: optional cache passed through to ``expand_one``.

    Returns:
        The processed document serialized with ``str()``.

    Raises:
        RuntimeError: if BeautifulSoup could not be imported.
    """
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")

    untouched_parents = ('a', 'script', 'pre', 'code', 'embed', 'object', 'audio', 'video')

    def replace_link(match):
        return expand_one(match.group(0), cdict)

    soup = BeautifulSoup(html)
    # drop comments first so they are not visited as text nodes below
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for node in soup.findAll(text=True):
        if node.parent.name in untouched_parents:
            continue
        expanded = regex_link.sub(replace_link, node)
        node.replaceWith(BeautifulSoup(expanded))
    return str(soup)
---|
201 | |
---|
202 | |
---|
def test():
    """Run ``expand_html`` on a small built-in sample and return the result.

    The sample holds one link of each supported kind: an oEmbed page, an
    image, a web2py component and a document.
    """
    sample_html = """
<h3>Fringilla nisi parturient nullam</h3>
<p>http://www.youtube.com/watch?v=IWBFiI5RrA0</p>
<p>http://www.web2py.com/examples/static/images/logo_bw.png</p>
<p>http://www.web2py.com/examples/default/index.load</p>
<p>http://www.web2py.com/examples/static/web2py_manual_cutl.pdf</p>
<p>Elementum sodales est varius magna leo sociis erat. Nascetur pretium non
ultricies gravida. Condimentum at nascetur tempus. Porttitor viverra ipsum
accumsan neque aliquet. Ultrices vestibulum tempor quisque eget sem eget.
Ornare malesuada tempus dolor dolor magna consectetur. Nisl dui non curabitur
laoreet tortor.</p>
"""
    return expand_html(sample_html)
---|
217 | |
---|
if __name__ == "__main__":
    # Command-line use: expand the HTML file named by the first argument,
    # or the built-in sample when no argument is given.
    if len(sys.argv) > 1:
        # context manager so the input file is closed once it has been read
        with open(sys.argv[1]) as source:
            print(expand_html(source.read()))
    else:
        print(test())
---|