view thp2epub.py @ 0:c6103c5987da draft default tip

Hello Gensokyo!
author Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
date Fri, 24 Aug 2012 14:48:18 +0200
parents
children
line wrap: on
line source

#!/usr/bin/env python
# -*- encoding: UTF-8 -*-
##
## Copyright © 2012 Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published
## by the Free Software Foundation; version 3 only.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##


from __future__ import unicode_literals

from lxml import etree
from lxml.builder import E
from time import strptime, strftime
import epub
from argparse import ArgumentParser

try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen
    from urllib2 import HTTPError


images_list = []
mime_type = {
    '.jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml'
}


class Thread(object):
    def __init__(self, op, replies):
        self.op = op
        self.replies = replies
        self.title = op.title
        self.author = op.author

    def render(self, only_op):
        html = E.html(
            E.head(
                E.title(self.title),
                E.link(href='story.css', type='text/css', rel='stylesheet') #TODO: convert it to a PI.
            ),
            E.body(
                E.h1(self.title)
            ),
            xmlns='http://www.w3.org/1999/xhtml'
        )

        body = html.find('body')
        body.append(self.op.render(display_title=False))

        for reply in self.replies:
            # Remove user answers if not wanted.
            if only_op and not reply.is_op(self.op):
                continue

            body.append(reply.render())

        return html


class Post(object):
    def __init__(self, title, author, date, image, content):
        self.title = title
        self.author = author
        self.date = date
        self.image = image
        self.content = content

    def is_op(self, op):
        return self.author.trip == op.author.trip

    def render(self, display_title=True):
        if display_title:
            title = E.h2(self.title) if self.title and display_title else E.h2('⁂')
        else:
            title = ''

        img = self.image.render() if self.image else ''

        p = E.p()
        for item in self.content:
            #TODO: remove useless attributes like onclick.
            p.append(item)

        article = E.article(
            title,
            E.footer(
                E.cite(self.author.render()),
                ' ',
                E.time(strftime('%y/%m/%d(%a)%H:%M', self.date), time=strftime('%y-%m-%dT%H:%M:%SZ', self.date))
            ),
            img,
            p
        )
        return article


class Image(object):
    def __init__(self, name, filesize, size, url):
        self.name = name
        self.filesize = filesize
        self.size = size
        self.url = url

    def render(self):
        try:
            url_file = urlopen(self.url)
        except HTTPError:
            return ''
        with open(self.name, 'wb') as out:
            out.write(url_file.read())
        images_list.append(self.name)
        return E.img(src=self.name, alt=self.name)


class Author(object):
    def __init__(self, name, trip, mail):
        self.name = name
        self.trip = trip
        self.mail = mail

    def render(self):
        return '{}{}'.format(self.name, self.trip)


def parse_post(root):
    # We use the filesize element because it contains the image name.
    label = root.find('label')
    filesize = root.find('span[@class="filesize"]')
    if filesize is not None:
        a = filesize.getnext().getnext()
        filesize = etree.tostring(filesize, method='text', encoding='UTF-8')
        filesize = filesize.split()
        name = filesize[7:-2]
        name = b' '.join(name)
        try:
            name = name.decode('UTF-8')
        except UnicodeDecodeError:
            for i in range(-5, -42, -1):
                char = name[i] if type(name[i]) is int else ord(name[i])
                if char & 0xc0 == 0xc0:
                    name = name[:i-1] + b'\xef\xbf\xbd' + name[-4:]
                    break
            name = name.decode('UTF-8')
        filesize, size = filesize[3][1:], filesize[5]
        if a.tag == 'a':
            url = a.get('href')
        else:
            url = a.find('img').get('src')

        image = Image(name, filesize, size, url)
    else:
        image = None

    label = root.find('label')
    title = label.find('span[@class="filetitle"]')
    title = title.text.strip() if title is not None else None
    name = label.find('span[@class="postername"]')
    if name is not None:
        last = name
        a = name.find('a')
        if a is not None:
            mail = a.get('href')
            name = a.text
        else:
            mail = ''
            name = name.text
    else:
        mail = ''
        name = ''

    postertrip = label.find('span[@class="postertrip"]')
    if postertrip is not None:
        trip = postertrip.text
        last = postertrip
    else:
        trip = ''

    author = Author(name, trip, mail)

    date = strptime(last.tail.strip(), '%y/%m/%d(%a)%H:%M')

    blockquote = root.find('blockquote')
    content = []
    for item in blockquote:
        if item is str and item.strip() == '':
            continue
        content.append(item)

    return Post(title, author, date, image, content)


def parse_thread(url):
    tree = etree.parse(url, etree.HTMLParser())

    root = tree.find('//form[@id="delform"]')
    if root is None:
        root = tree.find('//body')
    op = parse_post(root)

    replies = []
    td = root.findall('.//td[@class="reply"]')
    for reply in td:
        replies.append(parse_post(reply))

    return Thread(op, replies)


def main(url, forum, only_op, threads):
    threads_list = []
    for thread in threads:
        print('Rendering of thread №{}…'.format(thread))
        t = parse_thread(url.format(forum, thread))
        threads_list.append(t)

        html = t.render(only_op)

        # Use b mode as it allows us to directly dump UTF-8 data.
        with open('{}.xhtml'.format(thread), 'wb') as f:
            f.write(etree.tostring(html, pretty_print=True, xml_declaration=True, doctype='<!DOCTYPE html SYSTEM "/tmp/test.dtd">', encoding='UTF-8'))

    with epub.open('story.epub', 'w') as book:
        t = threads_list[0]

        book.opf.metadata.add_title(t.title)
        book.opf.metadata.add_creator(t.author.render())
        book.opf.metadata.add_date(strftime('%y-%m-%dT%H:%M:%SZ'))
        book.opf.metadata.add_language('en')

        for thread in threads:
            filename = '{}.xhtml'.format(thread)
            manifest_item = epub.opf.ManifestItem(identifier='thread_{}'.format(thread),
                                                  href=filename,
                                                  media_type='application/xhtml+xml')
            book.add_item(filename, manifest_item, True)

        for image in images_list:
            extension = image[-4:]
            manifest_item = epub.opf.ManifestItem(identifier='image_{}'.format(image),
                                                  href=image,
                                                  media_type=mime_type[extension])
            book.add_item(image, manifest_item, True)

        manifest_item = epub.opf.ManifestItem(identifier='style',
                                              href='story.css',
                                              media_type='text/css')
        book.add_item('story.css', manifest_item)

        book.toc.title = t.title
        nav_map = book.toc.nav_map
        for thread in threads:
            nav_point = epub.ncx.NavPoint()
            nav_point.identifier = 'thread_%d' % thread
            nav_point.add_label('Thread №%d' % thread)
            nav_point.src = '%d.xhtml' % thread
            nav_map.nav_point.append(nav_point)


if __name__ == '__main__':
    parser = ArgumentParser(description='Download and convert THP stories.')

    parser.add_argument('threads', metavar='THREADS', nargs='+', type=int, help='List of the threads of the story.')
    parser.add_argument('-u', '--url', metavar='URL', default='http://www.touhou-project.com/{}/res/{}.html', help='URL pattern from which the story will be downloaded, with the first {} as the forum, and the second as the thread.')
    parser.add_argument('-f', '--forum', metavar='FORUM', default='sdm', help='The name of the forum (example: sdm, th, etc.).')
    parser.add_argument('-o', '--only-op', action='store_true', help='Include only posts made by the original poster.')

    args = parser.parse_args()

    main(args.url, args.forum, args.only_op, args.threads)