For checkouts or to view logs, point your SVN client at svn://svn.saintamh.org/code/blog2booklet/blog2booklet.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
$Id: blog2booklet.py 3006 2016-04-01 23:27:06Z herve $
"""

#----------------------------------------------------------------------------------------------------------------------------------
# includes

# standards
from collections import namedtuple
from cStringIO import StringIO
from urlparse import urlparse
from datetime import datetime
from os import makedirs, symlink
from os.path import abspath, dirname, exists, join as pjoin
import re
from shutil import copy, rmtree
import subprocess
from sys import argv, exit, path as syspath, stderr
from tempfile import mkdtemp

# saintamh
from saintamh.cartesian import FloatDimension, IntDimension
from saintamh.http import HTTPError, HTTPStatus404, SimpleHTTPClient
from saintamh.levenshtein import levenshtein_distance
from saintamh.struct import absolute_http_url, dict_of, nullable, seq_of, struct
from saintamh.util.dates import days, imprecise_but_intuitive_timedelta_str
from saintamh.util.etree import build_etree, css_all, css_one, remove_all, remove_node, xpath_all, xpath_one, xpath_one_of
from saintamh.util.html import html_etree
from saintamh.util.iterables import dedup, merge_sorted_iterators, one
from saintamh.util.paths import here
from saintamh.util.regexes import capture_one, match_one
from saintamh.util.scrapers import extract_text, join_urls, make_all_urls_absolute
from saintamh.util.strings import normalize_spaces

# other apps
syspath.append (here (__file__, 'recess'))
from html2tex import HtmlToLatexConverter

# 3rd party libs
import lxml.etree as ET
from PIL import Image

#----------------------------------------------------------------------------------------------------------------------------------
# data structs

class BlogEntry (struct (
        url = nullable (absolute_http_url),
        datetime = nullable (datetime),
        title = nullable (unicode),
        author_name = nullable (unicode),
        body_el = ET._Element,
        extra_headers = nullable (dict_of(unicode,unicode)),
        )):
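    """
    One scraped blog post, with the metadata needed to render it as a section
    of the booklet.
    """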

    def __cmp__ (self, other):
        return (
            cmp (self.datetime and str(self.datetime), other.datetime and str(other.datetime))
            or cmp (self.title, other.title)
            or cmp (
                ET.tostring (self.body_el),
                ET.tostring (other.body_el)
            )
        )

class Blog (object):
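    """
    Base class for a scraped blog. Subclasses override the class attributes
    below and implement iter_entries() to yield BlogEntry objects in ascending
    date order (main() merges all blogs with merge_sorted_iterators, which
    expects sorted input).
    """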

    id = NotImplemented
    html_encoding = 'UTF-8'
    datetime_strf = '%Y-%m-%d %H:%M'
    include_time_elapsed_btw_entries = True
    user_agent = None

    def __init__ (self, min_datetime=None, max_datetime=None):
        self.min_datetime = min_datetime
        self.max_datetime = max_datetime
        if self.id is NotImplemented:
            self.id = self.__class__.__name__
        self.http = SimpleHTTPClient (
            cache_path = pjoin (dirname(__file__), 'cache', self.id),
            courtesy_delay = 5,
            user_agent=self.user_agent,
        )

    def fetch_html (self, url, *rest, **kwargs):
        html_bytes = self.fetch_html_bytes (url, *rest, **kwargs)
        return make_all_urls_absolute (
            url,
            html_etree(html_bytes.decode(self.html_encoding))
        )

    def fetch_html_bytes (self, url, *rest, **kwargs):
        return self.http.get (url, *rest, **kwargs)

    def iter_entries (self):
        raise NotImplementedError()

class Booklet (struct (
        id = str,
        title = unicode,
        blogs = seq_of(Blog),
        )):
    pass

#----------------------------------------------------------------------------------------------------------------------------------

class DependencyManager(object):
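    """
    Downloads the images referenced by the entries into `rel_path` under
    `root_dir` and rewrites their <img src> attributes to the local relative
    paths. When `latex_constraints` is set, images are converted to PNG where
    needed (the raster formats pdflatex accepts are JPEG and PNG) and shrunk to
    fit within `latex_max_dim`.
    """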

    latex_max_dim = IntDimension (350,500)
        
    def __init__ (self, root_dir, rel_path, latex_constraints=False):
        self.root_dir = root_dir
        self.rel_path = rel_path
        self.latex_constraints = latex_constraints
        self.downloaded = {}
        if exists (pjoin (self.root_dir, self.rel_path)):
            raise "%r exists -- won't overwrite" % pjoin (self.root_dir, self.rel_path)
        makedirs (pjoin (self.root_dir, self.rel_path))

    def fetch_image (self, http, url):
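        """
        Downloads `url` (at most once), saves it under the dependencies
        directory, and returns the relative path of the local copy; if the
        download or the image decoding fails, the original URL is returned
        unchanged.
        """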
        if url not in self.downloaded:
            try:
                img_bytes = http.get (url)
                img = Image.open(StringIO(img_bytes))
            except (HTTPError, IOError) as err:
                print err
                new_url = url
            else:
                if self.latex_constraints and img.format not in ('JPEG', 'PNG'):
                    file_ext = 'png'
                else:
                    file_ext = img.format.lower()
                if self.latex_constraints:
                    orig_size = FloatDimension(img.size)
                    ratio = max (orig_size / self.latex_max_dim)
                    if ratio > 1:
                        img = img.resize (IntDimension(orig_size / ratio))
                rel_file_path = pjoin (
                    self.rel_path,
                    '%d.%s' % (
                        1 + len(self.downloaded),
                        file_ext
                    )
                )
                print "    > %s" % rel_file_path
                img.save (pjoin (self.root_dir, rel_file_path))
                new_url = rel_file_path
            self.downloaded[url] = new_url
        return self.downloaded[url]

    def fetch_all (self, http, entry_el, encoding):
        for img_el in entry_el.xpath ('.//img[@src]'):
            img_el.set ('src', self.fetch_image (http, img_el.get('src').encode(encoding)))

#----------------------------------------------------------------------------------------------------------------------------------

class LiveJournal (Blog):
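    """
    Base class for the LiveJournal-hosted blogs: walks the calendar page to
    find every year, then every day that has entries, and delegates the
    parsing of each day page to the subclass.
    """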

    lj_username = NotImplemented

    def __init__ (self, *args, **kwargs):
        super(LiveJournal,self).__init__ (*args, **kwargs)
        self.index_url = 'http://%s.livejournal.com' % self.lj_username # nb no trailing slash

    def _fetch_and_filter_urls (self, url, re_url_path):
        return sorted (
            filter (
                re.compile (r'^%s/%s$' % (re.escape(self.index_url), re_url_path)).match,
                dedup (
                    join_urls (url, link_href)
                    for link_href in xpath_all (
                        '//a/@href',
                        self.fetch_html (url)
                    )
                )
            )
        )

    def _all_year_urls (self):
        return self._fetch_and_filter_urls (
            '%s/calendar' % self.index_url,
            r'\d\d\d\d/'
        )

    def _all_day_urls (self):
        for year_url in self._all_year_urls():
            _,year = match_one (r'/(\d\d\d\d)/$', year_url)
            for day_url in self._fetch_and_filter_urls (year_url, r'%s/\d\d/\d\d/' % year):
                yield day_url

    def iter_entries (self):
        for day_url in self._all_day_urls():
            date = datetime.strptime (
                match_one (r'/\d\d\d\d/\d\d/\d\d/$', day_url),
                '/%Y/%m/%d/'
            ).date()
            if (self.min_datetime and self.min_datetime.date() > date) \
                    or (self.max_datetime and self.max_datetime.date() < date):
                continue
            for entry in sorted (self._parse_entries_from_day_page(day_url,date)):
                remove_all (entry.body_el, './/img[contains(@src,"img/userinfo.gif")]')
                yield entry

    def _parse_entries_from_day_page (self, day_url, date):
        raise NotImplementedError()

class GiantLaser (LiveJournal):

    lj_username = 'giantlaser'

    def _parse_entries_from_day_page (self, day_url, date):
        for entry_el in xpath_all ('//div[@class="entry"]', self.fetch_html(day_url)):
            entry_url = join_urls (
                day_url,
                xpath_one ('.//a[@class="subjlink subj-link"]/@href', entry_el)
            )
            def header_text (label, allow_mismatch=False):
                header_el = xpath_one (
                    './/tr[./td[@class="dateheader"]/text()="%s:"]/td[@class="datecontent"]' % label,
                    entry_el,
                    allow_mismatch=allow_mismatch
                )
                return extract_text(header_el) if header_el is not None else None
            yield BlogEntry (
                url = entry_url,
                datetime = datetime.strptime (
                    header_text('Date'),
                    '%Y-%m-%d %H:%M'
                ),
                title = header_text ('Subject'),
                author_name = 'Tyler',
                extra_headers = {
                    k:v
                    for k,v in (
                        ('Music', header_text ('Music', allow_mismatch=True)),
                    )
                    if v
                },
                body_el = xpath_one ('.//div[@class="entry-text"]', entry_el),
            )

class SlowNewsDay (LiveJournal):

    lj_username = 'slownewsday'

    def _parse_entries_from_day_page (self, day_url, date):
        day_html = self.fetch_html (day_url)
        for entry_url in dedup (
                join_urls (day_url, re.sub (r'[\?\#].*', '', url))
                for url in xpath_all (
                        '//div[@style="text-align:right"]//a[contains(text(),"Comment")]/@href',
                        day_html
                )):
            if not re.match (r'^%s/\d+\.html$' % re.escape(self.index_url), entry_url):
                raise ValueError (repr(entry_url))
            entry_html = self.fetch_html(entry_url)
            extra_headers = {}
            meta_el = css_one ('ul.b-singlepost-meta', entry_html, allow_mismatch=True)
            if meta_el is not None:
                for row_el in xpath_all ('./li', meta_el):
                    key = re.sub (r':$', '', extract_text(remove_node(xpath_one('.//strong', row_el))))
                    val = extract_text (row_el)
                    extra_headers[key] = val
            title_el = css_one ('h1.b-singlepost-title', entry_html, allow_mismatch=True)
            if title_el is not None:
                entry_title = extract_text (title_el)
                remove_node (title_el)
            else:
                entry_title = None
            body_el = css_one ('article.b-singlepost-body', entry_html)
            while len(body_el) > 0 and not extract_text(body_el[0]):
                remove_node (body_el[0])
            yield BlogEntry (
                url = entry_url,
                datetime = datetime.strptime (
                    extract_text (css_one ('dd.b-singlepost-author-userinfo-screen time.published', entry_html)),
                    '%Y-%m-%d %H:%M:%S',
                ),
                author_name = 'Jayme',
                title = entry_title,
                body_el = body_el,
                extra_headers = extra_headers,
            )


class BBBB (Blog):
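    """
    The prologue to the 'iraq' booklet ("Blood, Bullets, Bombs, and
    Bandwidth"): a single static page, so iter_entries yields exactly one
    BlogEntry.
    """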

    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'

    def iter_entries (self, **kwargs):
        url = "http://www.rezendi.com/travels/bbbb.html"
        body_el = xpath_one (
            '//table/tr/td[./*]',
            ET.HTML (
                re.sub (
                    # 2011-08-06 - LXML messes up the <b> tags, why?
                    r'<(/?)b>', lambda m: '<%sstrong>' % m.group(1),
                    re.sub (
                        r'<(/?)i>', '',
                        self.http.get(url).decode('Windows-1252')
                    )
                )
            )
        )
        body_el.tag = 'div'
        remove_node (body_el[0]) # redundant title
        belorussian_el = xpath_one (
            '//a[@href="http://webhostinggeeks.com/science/bbbb-be"]',
            body_el
        )
        belorussian_el.tail = ''
        while belorussian_el.tail == '' and belorussian_el.getnext().tag == 'br':
            remove_node (belorussian_el.getnext())
        remove_node (belorussian_el) # removes the tail text as well
        yield BlogEntry (
            url = url,
            title = "Prologue: Blood, Bullets, Bombs, and Bandwidth",
            body_el = body_el,
        )

#----------------------------------------------------------------------------------------------------------------------------------

class IdleWords (Blog):

    # MAN, what a mess it is to parse this site. Just for the record:
    #
    #  * unclosed HTML attributes/tags
    #  * dead links (intra-site)
    #  * date formats are inconsistent
    #  * the site's actually broken in many ways in the browser
    #  * quotes randomly backslash-escaped in a few posts
    #  * mixed encodings within one file, UTF-8 data that's been decoded as Latin1, then reencoded as UTF8
    #
    # Plus I chose to go to the Wayback Machine for posts that are unreachable because of funny characters in the URL (i.e. broken
    # links on his site) or that he has since unpublished.

    id = 'idlewords'
    datetime_strf = '%B %d, %Y'
    include_time_elapsed_btw_entries = False

    PostSummary = namedtuple ('PostSummary', (
        'url',
        'title',
        'date',
        'summary_el',
    ))

    class UnpublishedEntry (ValueError):
        pass

    @staticmethod
    def try_to_fix_mixed_encodings (text):
        try:
            return (
                text
                # These two take care of UTF-8 bytes that have been, on the server side, read as Windows-1252, then served
                # encoded as UTF-8
                .decode('UTF-8')
                .encode('Windows-1252')
                # and these two are only here to throw an exception if the output isn't proper UTF-8
                .decode('UTF-8')
                .encode('UTF-8')
            )
        except UnicodeError:
            return text

    def fetch_html_bytes (self, url, *rest, **kwargs):
        html_bytes = super(IdleWords,self).fetch_html_bytes (url, *rest, **kwargs)
        html_bytes = re.sub (
            # His code to show the first few sentences of a post in the index cuts through tags. His HTML code is awful, overall.
            ur'<[^>]+\xE2\x80\xA6(?=</i>)',
            '',
            html_bytes,
        )
        html_bytes = re.sub (
            # Ugh, the quality of this guy's HTML seriously undermines his "Web dev frameworks are useless" argument
            r'(<a href=\"[^\">]+)>',
            r'\1">',
            html_bytes,
        )
        if len(re.findall(r'\\[\'\"]', html_bytes)) > 10:
            # see e.g. http://idlewords.com/2002/12/distributed_search_engines.htm
            html_bytes = re.sub (r'(?:(<[^>]+>)|\\([\'\"]))', lambda m: m.group(1) or m.group(2), html_bytes)
        html_bytes = re.sub (
            r'<div class="entrybox.+<table class="next_prev',
            lambda main_bytes: re.sub (
                r'(?<=[ -~])([^ -~]{2,8})(?=[ -~])',
                lambda suspicious_chars: self.try_to_fix_mixed_encodings(suspicious_chars.group()),
                main_bytes.group(),
            ),
            html_bytes,
            flags = re.S,
        )
        # for from_bytes,to_bytes in (
        #         # Proper encoding fixes proved too complicated, so I'm hand-picking them. Feels unsatisfactory, though
        #         ('\xc3\xa2\xe2\x82\xac\xe2\x80\x9c', '\xe2\x80\x93'),
        #         ('\xc3\xa2\xe2\x82\xac\xe2\x80\x9d', '\xe2\x80\x94'),
        #         ('\xc3\xa2\xe2\x82\xac\xe2\x84\xa2', '\xe2\x80\x99'),
        #         ('\xc3\xa2\xe2\x80\x9e\xc2\xa2', '\xe2\x84\xa2'),
        #         ('\xc3\x82\xc2\xa0', ' '),
        #         ('\xc3\x82 ', ' '),
        #         ('\xc3\x82\xc2\xa1', '\xc2\xa1'),
        #         ('\xc3\x83\xc2\xb3', '\xc3\xb3'),
        #         ('\xc3\x83\xc2\xa4', '\xc3\xa4'),
        #         ('\xc3\x83\xc2\xa8', '\xc3\xa8'),
        #         ):
        #     html_bytes = html_bytes.replace (from_bytes,to_bytes)
        return html_bytes

    def iter_summaries (self):
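        """
        Walks the archive table on the home page, then each month page within
        the configured date range, yielding one PostSummary per post.
        """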
        home_url = 'http://idlewords.com/'
        index_html = self.fetch_html (home_url)
        archive_el = xpath_one ('//fieldset[contains(@class,"archive")]/table', index_html)
        for year_el in reversed (xpath_all ('./tr', archive_el)):
            year_str = extract_text(year_el[0])
            if not year_str:
                continue
            assert re.search (r'^20\d\d$', year_str), repr(year_str)
            year = int (year_str)
            if (self.min_datetime and year < self.min_datetime.year) \
                    or (self.max_datetime and year > self.max_datetime.year):
                continue
            for month_url in xpath_all ('.//a/@href', year_el):
                month = int (capture_one (r'^%s%d/(\d+)/$' % (re.escape(home_url),year), month_url))
                if (
                        (self.min_datetime and (year == self.min_datetime.year and month < self.min_datetime.month)) or
                        (self.max_datetime and (year == self.max_datetime.year and month > self.max_datetime.month))
                        ):
                    continue
                month_html = self.fetch_html(month_url)
                for summary_el in reversed (xpath_all (
                        './div[contains(@class,"entrybox")]',
                        xpath_one ('//div[@id="entries"]', month_html)
                        )):
                    try:
                        date_el = xpath_one ('.//b[./a[@class="date"]]', summary_el)
                    except Exception:
                        print ET.tostring(summary_el)
                        raise
                    date = datetime.strptime (extract_text(date_el), '%m.%d.%Y')
                    assert date.year == year and date.month == month, (year, month, date)
                    title_el = xpath_one ('./a[@class="post_title"]', summary_el)
                    url = xpath_one ('./@href', title_el)
                    if re.search (r'/\.html?$', url):
                        print "Skipped bad URL %r" % url
                        continue
                    yield self.PostSummary (
                        url = url,
                        title = extract_text(title_el),
                        date = date,
                        summary_el = summary_el,
                    )

    def fetch_entry_the_straightfwd_way (self, summary):
        entry_html = self.fetch_html (summary.url)
        entry_el = xpath_one ('//div[contains(@class,"entrybox")]', entry_html)
        entry_dt = datetime.strptime (
            extract_text (remove_node (entry_el[0])),
            '%m.%d.%Y',
        )
        if entry_dt.year < 1975:
            if re.search (r'\bunpublished\b', entry_el.get('class')):
                raise self.UnpublishedEntry()
            else:
                print ET.tostring(entry_el)
                raise ValueError(entry_dt)
        entry_link_el = remove_node (xpath_one ('.//a[@class="post_title"]', entry_el))
        entry_url = entry_link_el.get('href')
        while len(entry_el) > 0 and not (extract_text(entry_el[0]) or entry_el.xpath('.//img')):
            remove_node (entry_el[0])
        for bloat_el in entry_el.xpath ('.//div[contains(@style,"ddeffa")]'):
            if 'Antarctica' in extract_text(bloat_el):
                remove_node(bloat_el)
        return BlogEntry (
            url = entry_url,
            datetime = entry_dt,
            title = extract_text (entry_link_el),
            # We don't need the author name on every single article if it's always the same
            # author_name = u'Maciej Cegłowski',
            body_el = entry_el,
        )

    def drill_down (self, node):
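        """
        Peels off wrapper elements: as long as a node has exactly one child
        that contains any text, descends into that child.
        """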
        children_with_text = filter (extract_text, node)
        if len(children_with_text) == 1:
            return self.drill_down(children_with_text[0])
        else:
            return node

    def fetch_entry_from_wayback_machine (self, summary):
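        """
        Fetches a later Wayback Machine snapshot of the relevant monthly
        archive page and tries to locate the entry matching `summary`, by date
        and by text similarity.
        """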
        archive_date = summary.date + days(365)
        archive_url = 'https://web.archive.org/web/%4d%02d000000/http://www.idlewords.com/weblog.%02d.%4d.html' % (
            archive_date.year + 1,
            archive_date.month,
            summary.date.month,
            summary.date.year,
        )
        archive_html = self.fetch_html (archive_url)
        archived_body_el = None
        if archive_date.year < 2004:
            all_entry_els = xpath_all ('//table[@width="390"]', archive_html)
        else:
            all_entry_els = css_all ('.entrybox', css_one ('#entries', archive_html))
        for entry_el in all_entry_els:
            body_el = self.drill_down(entry_el)
            date = self.extract_date_from_wayback_body_el (summary, body_el)
            text_match_score = self.wayback_body_el_text_match_score (summary, body_el)
            if abs(date-summary.date) < days(3) and text_match_score > 0.3:
                if archived_body_el is None:
                    archived_body_el = body_el
                else:
                    print archive_date
                    print ET.tostring (archived_body_el)
                    print '=' * 79
                    print date
                    print ET.tostring (body_el)
                    print '=' * 79
                    print summary
                    print ET.tostring (summary.summary_el)
                    raise ValueError ("Found two matching entries!")
        if archived_body_el is None:
            print "Couldn't find our entry (%r)" % (summary,)
            return None
        else:
            for time_el in archived_body_el.xpath('.//a[contains(@style,"767676")]'):
                if re.search (r'^[\dAPM\:\s]+$', extract_text(time_el)):
                    remove_node(time_el)
            return BlogEntry (
                url = summary.url,
                title = summary.title,
                datetime = summary.date,
                body_el = archived_body_el,
                extra_headers = {
                    "Scraper's note": "This entry was unpublished and had to be retrieved via the Internet Archive's Wayback Machine",
                },
            )

    def extract_date_from_wayback_body_el (self, summary, body_el):
        try:
            date_el = xpath_one_of (
                body_el,
                './b',
                './p/b[starts-with(text(),"%02d.")]' % summary.date.month,
                './span[contains(@style,"font-weight:bold")]',
                './p[contains(@style,"font-weight:bold")]',
                './/span[@class="date"]',
            )
            date_str = extract_text(date_el)
            date = datetime.strptime(
                date_str,
                # NB flipped day/month here, ugh:
                '%d.%m.%Y' if re.search(r'\d\d\d\d',date_str) else '%m.%d.%y',
            )
        except ValueError:
            print ET.tostring(body_el)
            raise
        else:
            remove_node(date_el)
            return date

    def wayback_body_el_text_match_score (self, summary, body_el):
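        """
        Rough similarity score between the index-page summary and a candidate
        archived entry: both texts are truncated to the shorter length, and the
        score is one minus the Levenshtein distance between the truncated
        texts, normalized by that length (1.0 means identical prefixes).
        """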
        summary_text = extract_text (summary.summary_el)
        wayback_text = extract_text (body_el)
        l = min (len(summary_text), len(wayback_text))
        score = (l - levenshtein_distance(summary_text[:l], wayback_text[:l])) / float(l)
        #print (summary_text[:l], wayback_text[:l], l, levenshtein_distance(summary_text[:l], wayback_text[:l]), score)
        return score

    def iter_entries (self):
        for summary in self.iter_summaries():
            try:
                entry = self.fetch_entry_the_straightfwd_way (summary)
            except (HTTPStatus404, self.UnpublishedEntry) as err:
                entry = None #self.fetch_entry_from_wayback_machine (summary)
                if entry is None:
                    continue
            yield entry

#----------------------------------------------------------------------------------------------------------------------------------

class HtmlRenderer (object):
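    """
    Renders the whole booklet as a single index.html file in the output
    directory, streaming entries to disk as they are scraped rather than
    building the full document in memory.
    """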

    @staticmethod
    def output_html_shell (title, body_str):
        return build_etree (
            'html',
            ('head',
             ('meta', {'charset': 'UTF-8'}),
             ('style', '''
                h2 { margin: 0 }
                h3 { margin: 0 }
                p.extra-header { margin: 0 }
                div.entry { margin-left: 5em; margin-right: 5em; }
                div.entry hr { width: 10em }
                table.headers { margin-bottom: 1em }
                hr.entry-separator { width: 90% !important; margin: 3em; }
                p.url { margin: 0 }
              '''),
             ('title', title)),
            ('body',
             ('h1', title),
             body_str)
            )

    @staticmethod
    def entry_as_html (blog, entry, prev_entry=None):
        if entry.datetime:
            entry_datetime_str = entry.datetime.strftime (blog.datetime_strf)
            if blog.include_time_elapsed_btw_entries and prev_entry and prev_entry.datetime:
                entry_datetime_str += ' (%s)' % imprecise_but_intuitive_timedelta_str (prev_entry.datetime, entry.datetime)
        return build_etree (
            'div.entry',
            ('table.headers', {'width': '100%'},
             ('tr',
              ('td',
               ('h2', entry.title) if entry.title else '',
               ('h3', 'by %s' % entry.author_name) if entry.author_name else '',
               ),
              ('td', {'align': 'right'},
               ('h3', entry_datetime_str) if entry.datetime else '',
               ('p.url', ('a', {'href': entry.url}, entry.url)),
               ) + tuple (('p.extra-header', ('b', '%s:' % key), u'\u00A0', val) for key,val in (entry.extra_headers or {}).iteritems() if val)
              )),
            entry.body_el,
            ('hr.entry-separator',)
        )

    def render (self, booklet, output_dir, iter_entries):
        # Some acrobatics are required so that we can write each entry to file as it arrives, rather than accumulating them all in
        # memory, while still using ET to generate the HTML
        mark_str = 'body goes here'
        html_str_header,html_str_footer = ET.tostring (
            self.output_html_shell (booklet.title, mark_str),
            method='html',
            encoding='UTF-8',
        ).split (mark_str)
        with open (pjoin (output_dir, 'index.html'), 'wb') as fh_out:
            fh_out.write (html_str_header)
            prev_entry = None
            for entry,blog in iter_entries:
                fh_out.write (
                    ET.tostring (
                        self.entry_as_html (blog, entry, prev_entry),
                        method='html',
                        encoding='UTF-8'
                    )
                )
                prev_entry = entry
            fh_out.write (html_str_footer)

#----------------------------------------------------------------------------------------------------------------------------------

class PdfRenderer(object):
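    """
    Renders the booklet to PDF: each entry is converted to LaTeX with
    HtmlToLatexConverter, and the assembled document is compiled with pdflatex
    in a temporary directory before being copied to the output directory.
    """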

    header_str = (
        "\\documentclass[]{article}\n"
        + "\\usepackage[T1]{fontenc}\n" 
        + "\\usepackage[utf8x]{inputenc}\n" 
        + "\\usepackage{graphicx}\n" 
        + "\\usepackage[bulgarian]{babel}\n"
        + "\\usepackage{ucs}\n" # not sure what this does but it stopped it choking on the romanian "s" with a cedilla
        + "\\usepackage[normalem]{ulem}\n" # makes \em underline
        + "\\usepackage{vmargin}\n" 
        + "\\setpapersize{A5}\n" 
        + "\\setmarginsrb{30pt}{20pt}{30pt}{15pt}{0pt}{0pt}{0pt}{15pt}\n"
    )

    def __init__ (self):
        self.converter = HtmlToLatexConverter()

    def entry_tex_str (self, blog, entry):
        for img_el in entry.body_el.xpath('.//img[@src]'):
            if '//' not in img_el.get('src'):
                img_el.set ('local-src', img_el.get('src'))
        return "\\section*{%s}" % self.converter.text(entry.title) \
            + "\\begin{flushright}\n" \
            + (("\\textit{%s}\n" % entry.datetime.strftime(blog.datetime_strf)) if entry.datetime else '') \
            + "\\end{flushright}\n" \
            + self.converter.convert (entry.body_el)

    def render (self, booklet, output_dir, iter_entries):
        tex_str = self.header_str \
            + "\\begin{document}\n" \
            + "\n\n".join (self.entry_tex_str(blog,entry) for entry,blog in iter_entries) \
            + "\\end{document}\n"
        tmp_dir = mkdtemp ()
        try:
            tex_file = pjoin (tmp_dir, 'booklet.tex')
            pdf_file = tex_file.replace ('.tex', '.pdf')
            with open (tex_file, 'wb') as fh:
                fh.write (tex_str.encode('UTF-8'))
            copy (tex_file, output_dir)
            symlink (abspath(pjoin(output_dir,'dependencies')), pjoin(tmp_dir,'dependencies'))
            subprocess.check_call (
                ['pdflatex', '-halt-on-error', tex_file],
                cwd = tmp_dir,
            )
            copy (pdf_file, output_dir)
        finally:
            rmtree (tmp_dir)

#----------------------------------------------------------------------------------------------------------------------------------
# main

ALL_BOOKLETS = (
    Booklet (
        id = 'iraq',
        title = 'Blood, Bullets, Bombs, and Bandwidth',
        blogs = (
            BBBB(),
            GiantLaser(
                min_datetime = datetime (2003,7,15),
                max_datetime = datetime (2005,4,27),
            ),
            SlowNewsDay(
                max_datetime = datetime (2005,4,26),
            ),
        )
    ),
    Booklet (
        id = 'idlewords',
        title = 'Idle Words',
        blogs = [IdleWords()],
    ),
)

ALL_RENDERERS = {
    'html': HtmlRenderer,
    'pdf': PdfRenderer,
}

def iter_blog_entries_with_dependencies (dependencies, blog):
    for entry in blog.iter_entries():
        dependencies.fetch_all (blog.http, entry.body_el, blog.html_encoding)
        yield entry

def prepare_output_dir (booklet):
    booklet_dir = pjoin (dirname(__file__), 'output', booklet.id)
    if exists (booklet_dir):
        raise Exception ("%r exists -- won't overwrite" % booklet_dir)
    makedirs (booklet_dir)
    return booklet_dir

def main (booklet, renderer):
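    """
    Scrapes every blog in the booklet (downloading images along the way),
    merges all entries into one date-sorted stream, and hands that stream to
    the selected renderer.
    """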
    booklet_dir = prepare_output_dir (booklet)
    dependencies = DependencyManager (
        booklet_dir,
        'dependencies',
        latex_constraints = isinstance (renderer, PdfRenderer),
    )
    iter_entries = merge_sorted_iterators (
        (
            (entry, blog)
            for entry in iter_blog_entries_with_dependencies (dependencies, blog)
        )
        for blog in booklet.blogs
    )
    renderer.render (booklet, booklet_dir, iter_entries)

#----------------------------------------------------------------------------------------------------------------------------------
# parse cmd line

if __name__ == '__main__':
    if len(argv) == 3:
        main(
            booklet = one (ALL_BOOKLETS, lambda b: b.id == argv[1]),
            renderer = ALL_RENDERERS[argv[2]](),
        )
    else:
        print >> stderr, "usage: %s <booklet-id> <renderer>" % argv[0]
        exit(2)

#----------------------------------------------------------------------------------------------------------------------------------