Mercurial > thp2epub
comparison thp2epub.py @ 0:c6103c5987da draft default tip
Hello Gensokyo!
author | Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> |
---|---|
date | Fri, 24 Aug 2012 14:48:18 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6103c5987da |
---|---|
1 #!/usr/bin/env python | |
2 # -*- encoding: UTF-8 -*- | |
3 ## | |
4 ## Copyright © 2012 Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> | |
5 ## | |
6 ## This program is free software; you can redistribute it and/or modify | |
7 ## it under the terms of the GNU General Public License as published | |
8 ## by the Free Software Foundation; version 3 only. | |
9 ## | |
10 ## This program is distributed in the hope that it will be useful, | |
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 ## GNU General Public License for more details. | |
14 ## | |
15 | |
16 | |
17 from __future__ import unicode_literals | |
18 | |
19 from lxml import etree | |
20 from lxml.builder import E | |
21 from time import strptime, strftime | |
22 import epub | |
23 from argparse import ArgumentParser | |
24 | |
25 try: | |
26 from urllib.request import urlopen | |
27 from urllib.error import HTTPError | |
28 except ImportError: | |
29 from urllib2 import urlopen | |
30 from urllib2 import HTTPError | |
31 | |
32 | |
# Names of the image files written to disk so far; Image.render() appends
# to it and main() reads it back to fill the EPUB manifest.
images_list = list()

# Media type for each supported image format, keyed by the LAST FOUR
# characters of the file name (main() slices with [-4:]); this is why the
# JPEG long form is spelled 'jpeg' without a leading dot.
mime_type = dict([
    ('.jpg', 'image/jpeg'),
    ('jpeg', 'image/jpeg'),
    ('.png', 'image/png'),
    ('.gif', 'image/gif'),
    ('.svg', 'image/svg+xml'),
])
41 | |
42 | |
class Thread(object):
    """A whole thread: its opening post plus the replies that follow it.

    Attributes:
        op: the opening Post.
        replies: the reply Post objects, in the order they were given.
        title: title of the opening post (None when the post has no title).
        author: Author of the opening post.
    """

    def __init__(self, op, replies):
        self.op = op
        self.replies = replies
        self.title = op.title
        self.author = op.author

    def render(self, only_op):
        """Return the thread as an lxml ``html`` element.

        only_op -- when true, replies whose author is not the opening
                   poster (as decided by Post.is_op) are left out.
        """
        # parse_post() yields None for a post without a subject, and the
        # lxml builder refuses None children (TypeError), so fall back to
        # an empty string for the document title and heading.
        title = self.title or ''

        html = E.html(
            E.head(
                E.title(title),
                E.link(href='story.css', type='text/css', rel='stylesheet')  #TODO: convert it to a PI.
            ),
            E.body(
                E.h1(title)
            ),
            xmlns='http://www.w3.org/1999/xhtml'
        )

        body = html.find('body')
        # The thread title is already shown in the <h1>, so the opening
        # post is rendered without its own heading.
        body.append(self.op.render(display_title=False))

        for reply in self.replies:
            # Remove user answers if not wanted.
            if only_op and not reply.is_op(self.op):
                continue

            body.append(reply.render())

        return html
73 | |
74 | |
class Post(object):
    """A single post of a thread.

    Attributes:
        title: subject of the post, or None/empty when there is none.
        author: the Author who wrote it.
        date: a time.struct_time, as accepted by time.strftime().
        image: the attached Image, or None.
        content: list of lxml elements forming the body of the post.
    """

    def __init__(self, title, author, date, image, content):
        self.title = title
        self.author = author
        self.date = date
        self.image = image
        self.content = content

    def is_op(self, op):
        """Tell whether this post was written by the author of *op*.

        Only the tripcodes are compared, so two posts without any
        tripcode are considered to come from the same author.
        """
        return self.author.trip == op.author.trip

    def render(self, display_title=True):
        """Return the post as an lxml ``article`` element.

        display_title -- when false no heading is emitted at all; when
                         true a post without title gets a '⁂' separator.
        """
        if display_title:
            title = E.h2(self.title) if self.title else E.h2('⁂')
        else:
            # An empty string adds nothing to the article.
            title = ''

        # Image.render() downloads the file as a side effect.
        img = self.image.render() if self.image else ''

        p = E.p()
        for item in self.content:
            #TODO: remove useless attributes like onclick.
            p.append(item)

        # The machine-readable stamp belongs in the ``datetime`` attribute
        # of <time> and needs a four-digit year.
        # NOTE(review): the parsed date carries no time zone; the trailing
        # 'Z' is kept from the previous behaviour -- confirm board times
        # really are UTC.
        article = E.article(
            title,
            E.footer(
                E.cite(self.author.render()),
                ' ',
                E.time(strftime('%y/%m/%d(%a)%H:%M', self.date),
                       datetime=strftime('%Y-%m-%dT%H:%M:%SZ', self.date))
            ),
            img,
            p
        )
        return article
110 | |
111 | |
class Image(object):
    """An image attached to a post.

    Attributes:
        name: file name of the image; also used as the local file name.
        filesize: file size as shown on the page.
        size: dimensions as shown on the page.
        url: address the image is downloaded from.
    """

    def __init__(self, name, filesize, size, url):
        self.name = name
        self.filesize = filesize
        self.size = size
        self.url = url

    def render(self):
        """Download the image and return an lxml ``img`` element for it.

        The data is written to a file called ``self.name`` in the current
        directory and that name is recorded in ``images_list``. When the
        server answers with an HTTP error nothing is written and an empty
        string is returned instead of an element.
        """
        # Local import: works on both Python 2 and 3, where the object
        # returned by urlopen() is not always a context manager.
        from contextlib import closing

        try:
            url_file = urlopen(self.url)
        except HTTPError:
            return ''

        # Close the connection as soon as the payload has been read
        # instead of leaking it until garbage collection.
        with closing(url_file):
            data = url_file.read()

        # NOTE(review): self.name comes from the downloaded page and is
        # used verbatim as a path -- consider sanitising it.
        with open(self.name, 'wb') as out:
            out.write(data)

        images_list.append(self.name)
        return E.img(src=self.name, alt=self.name)
128 | |
129 | |
class Author(object):
    """The author of a post.

    Attributes:
        name: displayed name (may be empty or None).
        trip: tripcode (may be empty or None).
        mail: target of the link around the name, or ''.
    """

    def __init__(self, name, trip, mail):
        self.name = name
        self.trip = trip
        self.mail = mail

    def render(self):
        """Return the name immediately followed by the tripcode.

        A missing part (None) is rendered as nothing rather than as the
        literal text 'None'.
        """
        return '{}{}'.format(self.name or '', self.trip or '')
138 | |
139 | |
def _parse_image(root):
    """Return the Image attached to the post under *root*, or None."""
    # We use the filesize element because it contains the image name.
    filesize = root.find('span[@class="filesize"]')
    if filesize is None:
        return None

    # The link (or bare thumbnail) is the second sibling after the span.
    a = filesize.getnext().getnext()

    words = etree.tostring(filesize, method='text', encoding='UTF-8').split()

    # The displayed name may have been cut in the middle of a multi-byte
    # character; let the codec put U+FFFD in place of the broken bytes
    # instead of patching the byte string by hand.
    name = b' '.join(words[7:-2]).decode('UTF-8', 'replace')

    # presumably the fourth word is '(<file size>' and the sixth the
    # dimensions -- the leading character of the former is dropped.
    filesize, size = words[3][1:], words[5]

    if a.tag == 'a':
        url = a.get('href')
    else:
        url = a.find('img').get('src')

    return Image(name, filesize, size, url)


def _parse_author(label):
    """Return (Author, element whose tail text holds the post date).

    The element is None when the label has neither a postername nor a
    postertrip span.
    """
    last = None
    name = ''
    mail = ''
    trip = ''

    postername = label.find('span[@class="postername"]')
    if postername is not None:
        last = postername
        a = postername.find('a')
        if a is not None:
            mail = a.get('href') or ''
            name = a.text or ''
        else:
            name = postername.text or ''

    postertrip = label.find('span[@class="postertrip"]')
    if postertrip is not None:
        trip = postertrip.text or ''
        # When present the tripcode comes after the name, so the date
        # follows it rather than the name.
        last = postertrip

    return Author(name, trip, mail), last


def parse_post(root):
    """Build a Post from the element that contains one post.

    root -- lxml element having the post's ``label`` and ``blockquote``
            (and optionally a ``span class="filesize"``) as children.

    Raises ValueError when the label or the date of the post cannot be
    found, or when the date does not match '%y/%m/%d(%a)%H:%M'.
    """
    image = _parse_image(root)

    label = root.find('label')
    if label is None:
        raise ValueError('post without <label> element')

    title = label.find('span[@class="filetitle"]')
    title = (title.text or '').strip() if title is not None else None

    author, last = _parse_author(label)
    if last is None or last.tail is None:
        raise ValueError('unable to locate the date of the post')

    date = strptime(last.tail.strip(), '%y/%m/%d(%a)%H:%M')

    content = []
    blockquote = root.find('blockquote')
    if blockquote is not None:
        # Iterating an element only yields its child elements; the text
        # placed before the first child lives in .text and would be lost,
        # so keep it (wrapped, as Post.render() appends elements only).
        if blockquote.text and blockquote.text.strip():
            content.append(E.span(blockquote.text))
        content.extend(blockquote)

    return Post(title, author, date, image, content)
205 | |
206 | |
def parse_thread(url):
    """Fetch and parse the page at *url* and return it as a Thread.

    The opening post is read from the ``delform`` form, or from the body
    when there is no such form; every ``td class="reply"`` found below
    that element becomes one reply.
    """
    document = etree.parse(url, etree.HTMLParser())

    container = document.find('//form[@id="delform"]')
    if container is None:
        container = document.find('//body')

    opening_post = parse_post(container)
    answers = [parse_post(cell)
               for cell in container.findall('.//td[@class="reply"]')]

    return Thread(opening_post, answers)
221 | |
222 | |
def main(url, forum, only_op, threads):
    """Download the given threads and pack them into 'story.epub'.

    url     -- pattern with two '{}' fields: the forum, then the thread.
    forum   -- name of the forum the threads belong to.
    only_op -- when true, keep only the posts of the original poster.
    threads -- non-empty sequence of thread numbers; the first one gives
               the book its title and creator.

    One '<thread>.xhtml' file per thread and every downloaded image are
    written to the current directory before being added to the book.
    'story.css' is added to the manifest as well (presumably it has to
    exist next to the script -- verify against the epub library).
    """
    # Local import so that this function does not depend on a change of
    # the module-level imports.
    from time import gmtime

    threads_list = []
    for thread in threads:
        print('Rendering of thread №{}…'.format(thread))
        t = parse_thread(url.format(forum, thread))
        threads_list.append(t)

        html = t.render(only_op)

        # Use b mode as it allows us to directly dump UTF-8 data.
        with open('{}.xhtml'.format(thread), 'wb') as f:
            # Plain HTML5 doctype; the former one referenced a DTD in /tmp
            # that only existed on the author's machine.
            f.write(etree.tostring(html, pretty_print=True,
                                   xml_declaration=True,
                                   doctype='<!DOCTYPE html>',
                                   encoding='UTF-8'))

    with epub.open('story.epub', 'w') as book:
        t = threads_list[0]

        book.opf.metadata.add_title(t.title)
        book.opf.metadata.add_creator(t.author.render())
        # Four-digit year, and UTC since the stamp ends with 'Z'.
        book.opf.metadata.add_date(strftime('%Y-%m-%dT%H:%M:%SZ', gmtime()))
        book.opf.metadata.add_language('en')

        for thread in threads:
            filename = '{}.xhtml'.format(thread)
            manifest_item = epub.opf.ManifestItem(identifier='thread_{}'.format(thread),
                                                  href=filename,
                                                  media_type='application/xhtml+xml')
            book.add_item(filename, manifest_item, True)

        # Several posts may carry an image with the same name; the file
        # exists only once on disk, so add it to the book only once.
        seen = set()
        for index, image in enumerate(images_list):
            if image in seen:
                continue
            seen.add(image)

            # mime_type is keyed by the last four characters of the name.
            # Unknown or upper-case extensions must not abort the build.
            extension = image[-4:].lower()
            media_type = mime_type.get(extension, 'application/octet-stream')

            # File names may hold characters that are not allowed in an
            # XML id, hence the numeric identifier.
            manifest_item = epub.opf.ManifestItem(identifier='image_{}'.format(index),
                                                  href=image,
                                                  media_type=media_type)
            book.add_item(image, manifest_item, True)

        manifest_item = epub.opf.ManifestItem(identifier='style',
                                              href='story.css',
                                              media_type='text/css')
        book.add_item('story.css', manifest_item)

        book.toc.title = t.title
        nav_map = book.toc.nav_map
        for thread in threads:
            nav_point = epub.ncx.NavPoint()
            nav_point.identifier = 'thread_{}'.format(thread)
            nav_point.add_label('Thread №{}'.format(thread))
            nav_point.src = '{}.xhtml'.format(thread)
            nav_map.nav_point.append(nav_point)
271 | |
272 | |
# Command-line entry point: read the options, then hand them to main().
if __name__ == '__main__':
    cli = ArgumentParser(description='Download and convert THP stories.')

    # Positional: one or more thread numbers.
    cli.add_argument(
        'threads',
        metavar='THREADS',
        nargs='+',
        type=int,
        help='List of the threads of the story.')
    # Where the threads are fetched from.
    cli.add_argument(
        '-u', '--url',
        metavar='URL',
        default='http://www.touhou-project.com/{}/res/{}.html',
        help='URL pattern from which the story will be downloaded, with the first {} as the forum, and the second as the thread.')
    cli.add_argument(
        '-f', '--forum',
        metavar='FORUM',
        default='sdm',
        help='The name of the forum (example: sdm, th, etc.).')
    # Filter out the answers of the readers.
    cli.add_argument(
        '-o', '--only-op',
        action='store_true',
        help='Include only posts made by the original poster.')

    options = cli.parse_args()

    main(url=options.url,
         forum=options.forum,
         only_op=options.only_op,
         threads=options.threads)