Mercurial > thp2epub
diff thp2epub.py @ 0:c6103c5987da draft default tip
Hello Gensokyo!
author | Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> |
---|---|
date | Fri, 24 Aug 2012 14:48:18 +0200 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# -*- encoding: UTF-8 -*-
##
## Copyright © 2012 Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 only.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##

"""Download THP stories and convert them to an EPUB book.

Each thread given on the command line is fetched, parsed into
Thread/Post/Image/Author objects, written to ``<thread>.xhtml`` in the
current directory and finally packed, together with the downloaded images
and ``story.css``, into ``story.epub``.
"""

from __future__ import unicode_literals

import mimetypes
import os
from argparse import ArgumentParser
from contextlib import closing
from time import gmtime, strftime, strptime

import epub
from lxml import etree
from lxml.builder import E

try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen
    from urllib2 import HTTPError


# Date format found in the post headers, e.g. "12/08/24(Fri)14:48".
BOARD_DATE_FORMAT = '%y/%m/%d(%a)%H:%M'
# Machine-readable timestamp; %Y (four digits) instead of the ambiguous %y.
ISO_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

# Names of the image files written to the current directory by Image.render().
images_list = []
mime_type = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    'jpeg': 'image/jpeg',  # kept for code that still looks up name[-4:]
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml'
}


def _media_type(filename):
    """Return the media type to declare in the manifest for *filename*.

    The extension is matched case-insensitively against ``mime_type``;
    unknown extensions fall back to the standard ``mimetypes`` database
    and finally to ``application/octet-stream`` instead of raising.
    """
    extension = os.path.splitext(filename)[1].lower()
    if extension in mime_type:
        return mime_type[extension]
    guessed = mimetypes.guess_type(filename)[0]
    return guessed or 'application/octet-stream'


class Thread(object):
    """A thread: its opening post plus the list of replies."""

    def __init__(self, op, replies):
        self.op = op
        self.replies = replies
        self.title = op.title
        self.author = op.author

    def render(self, only_op):
        """Return the thread as an lxml ``html`` element.

        only_op: when true, replies for which ``reply.is_op(self.op)`` is
                 false are left out.
        """
        # The title is None when the opening post has no "filetitle" span;
        # the lxml builder rejects None children, so use an empty string.
        title = self.title or ''

        html = E.html(
            E.head(
                E.title(title),
                E.link(href='story.css', type='text/css', rel='stylesheet')  #TODO: convert it to a PI.
            ),
            E.body(
                E.h1(title)
            ),
            xmlns='http://www.w3.org/1999/xhtml'
        )

        body = html.find('body')
        body.append(self.op.render(display_title=False))

        for reply in self.replies:
            # Remove user answers if not wanted.
            if only_op and not reply.is_op(self.op):
                continue

            body.append(reply.render())

        return html


class Post(object):
    """A single post: title, author, date, optional image and content.

    ``content`` is a list of lxml elements; it may also contain text
    strings (the text preceding the first element of the post body).
    """

    def __init__(self, title, author, date, image, content):
        self.title = title
        self.author = author
        self.date = date
        self.image = image
        self.content = content

    def is_op(self, op):
        """Tell whether this post has the same tripcode as the post *op*."""
        # NOTE(review): two posters without any tripcode both have an empty
        # trip and therefore compare equal here - confirm this is intended.
        return self.author.trip == op.author.trip

    def render(self, display_title=True):
        """Return the post as an lxml ``article`` element.

        display_title: when true, an ``h2`` with the post title (or '⁂'
                       when the post has none) is emitted first.

        Rendering the image downloads it (see Image.render).
        """
        if display_title:
            title = E.h2(self.title or '⁂')
        else:
            title = ''

        img = self.image.render() if self.image else ''

        p = E.p()
        for item in self.content:
            #TODO: remove useless attributes like onclick.
            if etree.iselement(item):
                # Appending moves the element, together with its tail text.
                p.append(item)
            elif len(p):
                # Text after an element lives in that element's tail.
                p[-1].tail = (p[-1].tail or '') + item
            else:
                p.text = (p.text or '') + item

        article = E.article(
            title,
            E.footer(
                E.cite(self.author.render()),
                ' ',
                E.time(strftime(BOARD_DATE_FORMAT, self.date),
                       time=strftime(ISO_DATE_FORMAT, self.date))
            ),
            img,
            p
        )
        return article


class Image(object):
    """An image attached to a post."""

    def __init__(self, name, filesize, size, url):
        self.name = name
        self.filesize = filesize
        self.size = size
        self.url = url

    def render(self):
        """Download the image and return an ``img`` element pointing to it.

        The file is written to the current directory and its name is
        recorded in ``images_list``.  Returns '' when the download fails
        with an HTTP error or when no usable file name is available.
        """
        # The name comes from the remote page: keep only its last path
        # component so that it cannot make us write outside the current
        # directory.
        filename = os.path.basename(self.name)
        if not filename:
            return ''

        try:
            url_file = urlopen(self.url)
        except HTTPError:
            return ''

        # closing() because the Python 2 response is not a context manager.
        with closing(url_file):
            data = url_file.read()

        with open(filename, 'wb') as out:
            out.write(data)

        # The same file may be rendered more than once; list it only once
        # so that the manifest does not get duplicate entries.
        if filename not in images_list:
            images_list.append(filename)
        return E.img(src=filename, alt=filename)


class Author(object):
    """The poster of a post: name, tripcode and mail link."""

    def __init__(self, name, trip, mail):
        self.name = name
        self.trip = trip
        self.mail = mail

    def render(self):
        """Return the name immediately followed by the tripcode."""
        return '{}{}'.format(self.name, self.trip)


def _decode_image_name(raw):
    """Decode the UTF-8 encoded image name *raw* (bytes) to text.

    When *raw* is not valid UTF-8, the bytes from -5 down to -41 are
    scanned for one with both high bits set; the name is cut just before
    it, U+FFFD is inserted and the last four bytes are kept.
    Presumably this handles names the site truncated in the middle of a
    multi-byte character, the last four bytes being the extension -
    TODO confirm.
    """
    try:
        return raw.decode('UTF-8')
    except UnicodeDecodeError:
        pass

    # Same window as before (-5 .. -41), but never past the first byte.
    stop = max(-42, -len(raw) - 1)
    for i in range(-5, stop, -1):
        # Indexing bytes gives an int on Python 3 and a str on Python 2.
        char = raw[i] if isinstance(raw[i], int) else ord(raw[i])
        if char & 0xc0 == 0xc0:
            raw = raw[:i - 1] + b'\xef\xbf\xbd' + raw[-4:]
            break

    # Anything still invalid is replaced rather than aborting the run.
    return raw.decode('UTF-8', 'replace')


def _parse_image(filesize):
    """Build an Image from the ``span.filesize`` element of a post."""
    # The second sibling after the span is either the link to the image or
    # an element containing the <img> itself.
    a = filesize.getnext().getnext()

    fields = etree.tostring(filesize, method='text', encoding='UTF-8').split()
    name = _decode_image_name(b' '.join(fields[7:-2]))
    size_on_disk, dimensions = fields[3][1:], fields[5]

    if a.tag == 'a':
        url = a.get('href')
    else:
        url = a.find('img').get('src')

    return Image(name, size_on_disk, dimensions, url)


def parse_post(root):
    """Parse the post contained in the element *root* and return a Post.

    Raises ValueError when the post header contains no date.
    """
    # We use the filesize element because it contains the image name.
    filesize = root.find('span[@class="filesize"]')
    image = _parse_image(filesize) if filesize is not None else None

    label = root.find('label')
    title = label.find('span[@class="filetitle"]')
    title = (title.text or '').strip() if title is not None else None

    # `last` is the last header element seen; the date is its tail text.
    last = None
    name = ''
    mail = ''
    postername = label.find('span[@class="postername"]')
    if postername is not None:
        last = postername
        a = postername.find('a')
        if a is not None:
            mail = a.get('href') or ''
            name = a.text or ''
        else:
            name = postername.text or ''

    postertrip = label.find('span[@class="postertrip"]')
    if postertrip is not None:
        trip = postertrip.text or ''
        last = postertrip
    else:
        trip = ''

    author = Author(name, trip, mail)

    date_text = (last.tail or '').strip() if last is not None else ''
    if not date_text:
        raise ValueError('no date found in the post header')
    date = strptime(date_text, BOARD_DATE_FORMAT)

    content = []
    blockquote = root.find('blockquote')
    if blockquote is not None:
        # Iterating an element only yields its child elements: the text
        # placed before the first child has to be picked up separately.
        if blockquote.text and blockquote.text.strip():
            content.append(blockquote.text)
        content.extend(blockquote)

    return Post(title, author, date, image, content)


def parse_thread(url):
    """Fetch and parse the thread page at *url* and return a Thread."""
    tree = etree.parse(url, etree.HTMLParser())

    # './/' rather than '//': ElementTree.find() rewrites absolute paths
    # to this form anyway, but with a FutureWarning.
    root = tree.find('.//form[@id="delform"]')
    if root is None:
        root = tree.find('.//body')
    op = parse_post(root)

    replies = [parse_post(reply)
               for reply in root.findall('.//td[@class="reply"]')]

    return Thread(op, replies)


def main(url, forum, only_op, threads):
    """Render every thread to XHTML and pack the result in story.epub.

    url:     URL pattern with two ``{}`` placeholders, filled with the
             forum and the thread.
    forum:   name of the forum.
    only_op: passed to Thread.render().
    threads: non-empty list of thread numbers; the title and author of
             the first one are used for the whole book.

    Writes ``<thread>.xhtml`` files, the images and ``story.epub`` to the
    current directory; ``story.css`` must already be there.
    """
    if not threads:
        raise ValueError('at least one thread is required')

    threads_list = []
    for thread in threads:
        print('Rendering of thread №{}…'.format(thread))
        t = parse_thread(url.format(forum, thread))
        threads_list.append(t)

        html = t.render(only_op)

        # Use b mode as it allows us to directly dump UTF-8 data.
        with open('{}.xhtml'.format(thread), 'wb') as f:
            f.write(etree.tostring(html, pretty_print=True,
                                   xml_declaration=True,
                                   doctype='<!DOCTYPE html>',
                                   encoding='UTF-8'))

    with epub.open('story.epub', 'w') as book:
        first = threads_list[0]
        book_title = first.title or ''

        book.opf.metadata.add_title(book_title)
        book.opf.metadata.add_creator(first.author.render())
        # The trailing Z means UTC, so format the UTC time.
        book.opf.metadata.add_date(strftime(ISO_DATE_FORMAT, gmtime()))
        book.opf.metadata.add_language('en')

        for thread in threads:
            filename = '{}.xhtml'.format(thread)
            manifest_item = epub.opf.ManifestItem(identifier='thread_{}'.format(thread),
                                                  href=filename,
                                                  media_type='application/xhtml+xml')
            book.add_item(filename, manifest_item, True)

        for index, image in enumerate(images_list):
            # File names may contain characters that are not allowed in an
            # identifier, hence the index.
            manifest_item = epub.opf.ManifestItem(identifier='image_{}'.format(index),
                                                  href=image,
                                                  media_type=_media_type(image))
            # Like the stylesheet, images belong to the manifest only: the
            # spine is the reading order of the content documents.
            book.add_item(image, manifest_item)

        manifest_item = epub.opf.ManifestItem(identifier='style',
                                              href='story.css',
                                              media_type='text/css')
        book.add_item('story.css', manifest_item)

        book.toc.title = book_title
        nav_map = book.toc.nav_map
        for thread in threads:
            nav_point = epub.ncx.NavPoint()
            nav_point.identifier = 'thread_{}'.format(thread)
            nav_point.add_label('Thread №{}'.format(thread))
            nav_point.src = '{}.xhtml'.format(thread)
            nav_map.nav_point.append(nav_point)


if __name__ == '__main__':
    parser = ArgumentParser(description='Download and convert THP stories.')

    parser.add_argument('threads', metavar='THREADS', nargs='+', type=int,
                        help='List of the threads of the story.')
    parser.add_argument('-u', '--url', metavar='URL',
                        default='http://www.touhou-project.com/{}/res/{}.html',
                        help='URL pattern from which the story will be '
                             'downloaded, with the first {} as the forum, '
                             'and the second as the thread.')
    parser.add_argument('-f', '--forum', metavar='FORUM', default='sdm',
                        help='The name of the forum (example: sdm, th, etc.).')
    parser.add_argument('-o', '--only-op', action='store_true',
                        help='Include only posts made by the original poster.')

    args = parser.parse_args()

    main(args.url, args.forum, args.only_op, args.threads)