annotate thp2epub.py @ 0:c6103c5987da draft default tip

Hello Gensokyo!
author Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
date Fri, 24 Aug 2012 14:48:18 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
1 #!/usr/bin/env python
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
2 # -*- encoding: UTF-8 -*-
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
3 ##
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
4 ## Copyright © 2012 Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
5 ##
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
6 ## This program is free software; you can redistribute it and/or modify
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
7 ## it under the terms of the GNU General Public License as published
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
8 ## by the Free Software Foundation; version 3 only.
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
9 ##
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
10 ## This program is distributed in the hope that it will be useful,
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
11 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
13 ## GNU General Public License for more details.
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
14 ##
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
15
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
16
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
17 from __future__ import unicode_literals
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
18
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
19 from lxml import etree
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
20 from lxml.builder import E
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
21 from time import strptime, strftime
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
22 import epub
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
23 from argparse import ArgumentParser
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
24
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
25 try:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
26 from urllib.request import urlopen
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
27 from urllib.error import HTTPError
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
28 except ImportError:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
29 from urllib2 import urlopen
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
30 from urllib2 import HTTPError
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
31
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
32
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
# Names of the image files written to disk while rendering posts; main()
# later adds each of them to the EPUB manifest.
images_list = []

# Media type for each supported image, keyed by the last four characters of
# the file name (main() looks extensions up with image[-4:]), which is why
# the 'jpeg' key carries no leading dot.
mime_type = dict([
    ('.jpg', 'image/jpeg'),
    ('jpeg', 'image/jpeg'),
    ('.png', 'image/png'),
    ('.gif', 'image/gif'),
    ('.svg', 'image/svg+xml'),
])
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
41
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
42
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
class Thread(object):
    """A thread: its opening post plus the replies that follow it.

    The title and author of the thread are taken from the opening post.
    """

    def __init__(self, op, replies):
        self.op = op
        self.replies = replies
        self.title = op.title
        self.author = op.author

    def render(self, only_op):
        """Return the thread as an XHTML ``html`` element.

        When ``only_op`` is true, replies for which ``reply.is_op(self.op)``
        is false are left out of the document.
        """
        head = E.head(
            E.title(self.title),
            E.link(href='story.css', type='text/css', rel='stylesheet') #TODO: convert it to a PI.
        )
        body = E.body(E.h1(self.title))
        document = E.html(head, body, xmlns='http://www.w3.org/1999/xhtml')

        # The thread title is already shown in the <h1> above, so the
        # opening post is rendered without its own heading.
        body.append(self.op.render(display_title=False))

        for reply in self.replies:
            # Keep every reply, unless only the original poster is wanted.
            if not only_op or reply.is_op(self.op):
                body.append(reply.render())

        return document
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
73
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
74
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
class Post(object):
    """One message of a thread.

    Attributes:
        title: heading of the post, or a falsy value when it has none.
        author: object exposing ``trip`` and ``render()`` (see Author).
        date: time tuple accepted by ``time.strftime``.
        image: object exposing ``render()`` (see Image), or a falsy value.
        content: iterable of lxml elements forming the body of the post.
    """

    def __init__(self, title, author, date, image, content):
        self.title = title
        self.author = author
        self.date = date
        self.image = image
        self.content = content

    def is_op(self, op):
        """Tell whether this post carries the same tripcode as ``op``."""
        return self.author.trip == op.author.trip

    def _iso_timestamp(self):
        """Return the post date as ``YYYY-MM-DDTHH:MM:SSZ``.

        ``%Y`` is used so that the machine-readable value has a four-digit
        year; the two-digit ``%y`` is kept only for the human-readable text.
        """
        return strftime('%Y-%m-%dT%H:%M:%SZ', self.date)

    def render(self, display_title=True):
        """Return the post as an ``article`` element.

        With ``display_title`` set, the article starts with an ``h2`` that
        holds the title, or '⁂' when the post has none; otherwise no heading
        is emitted.
        """
        if display_title:
            title = E.h2(self.title) if self.title else E.h2('⁂')
        else:
            title = ''

        img = self.image.render() if self.image else ''

        p = E.p()
        for item in self.content:
            #TODO: remove useless attributes like onclick.
            p.append(item)

        # NOTE(review): HTML5 names the machine-readable attribute of <time>
        # "datetime"; the attribute name "time" is kept as-is here, confirm
        # whether it should be renamed.
        return E.article(
            title,
            E.footer(
                E.cite(self.author.render()),
                ' ',
                E.time(strftime('%y/%m/%d(%a)%H:%M', self.date), time=self._iso_timestamp())
            ),
            img,
            p
        )
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
110
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
111
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
class Image(object):
    """An image attached to a post.

    Attributes:
        name: file name, used both as the local path the image is saved to
            and as the ``src``/``alt`` of the rendered element.
        filesize: file size as scraped from the page.
        size: dimensions as scraped from the page.
        url: address the image is downloaded from.
    """

    def __init__(self, name, filesize, size, url):
        self.name = name
        self.filesize = filesize
        self.size = size
        self.url = url

    def render(self):
        """Download the image and return an ``img`` element pointing at it.

        The file is written to ``self.name`` and that name is recorded in
        the module-level ``images_list``. If the server answers with an HTTP
        error, nothing is written and an empty string is returned instead.
        """
        # Imported here because Python 2's urlopen() result is not a context
        # manager; closing() works on both Python 2 and 3.
        from contextlib import closing

        try:
            url_file = urlopen(self.url)
        except HTTPError:
            return ''

        # Read everything first so the connection is always released and no
        # empty file is left behind if the download fails midway.
        with closing(url_file):
            data = url_file.read()

        with open(self.name, 'wb') as out:
            out.write(data)

        images_list.append(self.name)
        return E.img(src=self.name, alt=self.name)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
128
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
129
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
class Author(object):
    """The poster of a message: display name, tripcode and mail link."""

    def __init__(self, name, trip, mail):
        self.name = name
        self.trip = trip
        self.mail = mail

    def render(self):
        """Return the name immediately followed by the tripcode.

        A missing part (``None``, as lxml reports for an element without
        text) is rendered as an empty string rather than the text 'None'.
        """
        return '{}{}'.format(self.name or '', self.trip or '')
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
138
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
139
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
def _decode_image_name(raw):
    """Decode a UTF-8 encoded image name, tolerating a damaged character.

    If ``raw`` is not valid UTF-8, the bytes in front of the last four (the
    extension) are scanned backwards for a UTF-8 lead byte, and the name is
    rebuilt with U+FFFD in place of what follows it. Anything that still
    cannot be decoded is replaced rather than raising.
    """
    try:
        return raw.decode('UTF-8')
    except UnicodeDecodeError:
        pass

    # bytearray yields integers when indexed on both Python 2 and 3.
    octets = bytearray(raw)
    # Never look further back than 41 bytes, nor past the start of the name.
    for i in range(-5, -min(len(octets), 41) - 1, -1):
        # 0b11xxxxxx marks the first byte of a multi-byte UTF-8 sequence.
        if octets[i] & 0xc0 == 0xc0:
            # NOTE(review): the slice stops at i-1, so the byte just before
            # the lead byte is dropped as well; confirm this is intended.
            raw = raw[:i-1] + b'\xef\xbf\xbd' + raw[-4:]
            break
    return raw.decode('UTF-8', 'replace')


def parse_post(root):
    """Build a Post from the element that contains one message.

    ``root`` is searched for its direct ``label``, ``blockquote`` and
    (optional) ``span[@class="filesize"]`` children.

    Raises:
        ValueError: if no date text follows the poster name or tripcode, or
            if that text does not match ``%y/%m/%d(%a)%H:%M``.
    """
    # We use the filesize element because it contains the image name.
    filesize = root.find('span[@class="filesize"]')
    if filesize is not None:
        # The element two siblings further is either the link to the image
        # or a container holding its <img>.
        a = filesize.getnext().getnext()
        words = etree.tostring(filesize, method='text', encoding='UTF-8').split()
        name = _decode_image_name(b' '.join(words[7:-2]))
        # The fourth word starts with one extra character that is dropped.
        filesize, size = words[3][1:], words[5]
        if a.tag == 'a':
            url = a.get('href')
        else:
            url = a.find('img').get('src')

        image = Image(name, filesize, size, url)
    else:
        image = None

    label = root.find('label')
    title = label.find('span[@class="filetitle"]')
    title = title.text.strip() if title is not None else None

    # 'last' tracks the header element whose tail text holds the date.
    last = None
    mail = ''
    name = ''
    postername = label.find('span[@class="postername"]')
    if postername is not None:
        last = postername
        a = postername.find('a')
        if a is not None:
            mail = a.get('href')
            name = a.text
        else:
            name = postername.text

    postertrip = label.find('span[@class="postertrip"]')
    if postertrip is not None:
        trip = postertrip.text
        last = postertrip
    else:
        trip = ''

    author = Author(name, trip, mail)

    if last is None or last.tail is None:
        raise ValueError('no date found after the poster name or tripcode')
    date = strptime(last.tail.strip(), '%y/%m/%d(%a)%H:%M')

    blockquote = root.find('blockquote')
    # Iterating an lxml element yields its child elements only.
    # NOTE(review): text placed directly at the start of the blockquote
    # (blockquote.text) is therefore not part of the content; confirm
    # whether it should be kept.
    content = list(blockquote)

    return Post(title, author, date, image, content)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
205
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
206
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
def parse_thread(url):
    """Parse the HTML page at ``url`` and return it as a Thread.

    Posts are looked up inside the form whose id is "delform", or inside
    the whole body when the page has no such form. That container is parsed
    as the opening post, and every ``td`` of class "reply" below it as a
    reply.
    """
    document = etree.parse(url, etree.HTMLParser())

    container = document.find('//form[@id="delform"]')
    if container is None:
        container = document.find('//body')

    first_post = parse_post(container)
    answers = [parse_post(cell)
               for cell in container.findall('.//td[@class="reply"]')]

    return Thread(first_post, answers)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
221
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
222
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
223 def main(url, forum, only_op, threads):
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
224 threads_list = []
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
225 for thread in threads:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
226 print('Rendering of thread №{}…'.format(thread))
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
227 t = parse_thread(url.format(forum, thread))
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
228 threads_list.append(t)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
229
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
230 html = t.render(only_op)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
231
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
232 # Use b mode as it allows us to directly dump UTF-8 data.
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
233 with open('{}.xhtml'.format(thread), 'wb') as f:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
234 f.write(etree.tostring(html, pretty_print=True, xml_declaration=True, doctype='<!DOCTYPE html SYSTEM "/tmp/test.dtd">', encoding='UTF-8'))
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
235
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
236 with epub.open('story.epub', 'w') as book:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
237 t = threads_list[0]
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
238
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
239 book.opf.metadata.add_title(t.title)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
240 book.opf.metadata.add_creator(t.author.render())
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
241 book.opf.metadata.add_date(strftime('%y-%m-%dT%H:%M:%SZ'))
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
242 book.opf.metadata.add_language('en')
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
243
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
244 for thread in threads:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
245 filename = '{}.xhtml'.format(thread)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
246 manifest_item = epub.opf.ManifestItem(identifier='thread_{}'.format(thread),
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
247 href=filename,
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
248 media_type='application/xhtml+xml')
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
249 book.add_item(filename, manifest_item, True)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
250
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
251 for image in images_list:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
252 extension = image[-4:]
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
253 manifest_item = epub.opf.ManifestItem(identifier='image_{}'.format(image),
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
254 href=image,
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
255 media_type=mime_type[extension])
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
256 book.add_item(image, manifest_item, True)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
257
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
258 manifest_item = epub.opf.ManifestItem(identifier='style',
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
259 href='story.css',
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
260 media_type='text/css')
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
261 book.add_item('story.css', manifest_item)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
262
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
263 book.toc.title = t.title
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
264 nav_map = book.toc.nav_map
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
265 for thread in threads:
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
266 nav_point = epub.ncx.NavPoint()
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
267 nav_point.identifier = 'thread_%d' % thread
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
268 nav_point.add_label('Thread №%d' % thread)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
269 nav_point.src = '%d.xhtml' % thread
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
270 nav_map.nav_point.append(nav_point)
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
271
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
272
c6103c5987da Hello Gensokyo!
Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: read the story parameters from the command line
    # and pass them on to main().
    cli = ArgumentParser(description='Download and convert THP stories.')

    # Positional: one or more thread numbers, each converted to int.
    cli.add_argument(
        'threads',
        metavar='THREADS',
        nargs='+',
        type=int,
        help='List of the threads of the story.',
    )

    # URL template with two '{}' placeholders (forum, then thread).
    cli.add_argument(
        '-u', '--url',
        metavar='URL',
        default='http://www.touhou-project.com/{}/res/{}.html',
        help='URL pattern from which the story will be downloaded, with the first {} as the forum, and the second as the thread.',
    )

    # Forum name substituted into the URL template.
    cli.add_argument(
        '-f', '--forum',
        metavar='FORUM',
        default='sdm',
        help='The name of the forum (example: sdm, th, etc.).',
    )

    # Boolean flag, False unless given on the command line.
    cli.add_argument(
        '-o', '--only-op',
        action='store_true',
        help='Include only posts made by the original poster.',
    )

    options = cli.parse_args()

    main(options.url, options.forum, options.only_op, options.threads)