For checkouts or to view logs direct your SVN client to svn://svn.saintamh.org/code/recess/py/recess.py

#!/usr/bin/env python

"""
$Id: recess.py 3189 2017-05-14 21:04:58Z herve $
"""

#----------------------------------------------------------------------------------------------------------------------------------
# includes

# Python standards
import collections
import datetime
import itertools
import os
import re
import hashlib
import string
import traceback

# libs loaded via SVN externals
from saintamh.feeds import Feed
from saintamh.http import SimpleHTTPClient
from saintamh.struct import *
from saintamh.util.cache import cache_mru
from saintamh.util.dates import hours
from saintamh.util.etree import build_etree,remove_node,walk_subtree_allowing_edits
from saintamh.util.html import html_etree
from saintamh.util.mysqldb import MySQLDB
from saintamh.util.paths import here, makedirs
from saintamh.util.scrapers import extract_text,make_all_urls_absolute,remove_all_js

# 3rd party libs
import lxml.etree as ET

#----------------------------------------------------------------------------------------------------------------------------------
# data structs

MAX_SUBSCRIPTION_LABEL_LEN = 64

class Wire (struct (

        # Wire is distinct from saintamh.feeds.Feed for a few reasons. Firstly, this class represents the thing in our DB that you
        # subscribe to, while Feed represents the data that comes in the rss file. Their contents overlap a lot, but really this
        # one is what's shown in the reader, Feed is what's contained in the XML files, and their contents might differ or be
        # discovered at different times. Secondly, I didn't want to tie this central class to an external lib too much.

        id = sql_numeric_id,

        # FIXME - 2011-10-08 - Set maxlen to 1000 so that I can have a UNIQUE constraint on this column, but really I would like to
        # allow the 2000 bytes I usually give URLs. What's the bext approach here?
        xml_url = {
            'class': str,
            'regex': r'^https?://',
            'maxlen': 1000,
            },

        html_url = nullable (absolute_http_url),
        default_title = nullable (unicode),
        excluded_from_backups = nullable (bool),
        activated = bool,
        )):
    pass


class WEntry (struct (
        # FIXME - 2011-10-08 - better name (posts?)

        id = sql_numeric_id,
        wire_id = sql_foreign_numeric_id,
        url = nullable (absolute_http_url),
        title = nullable (unicode),
        author = nullable (unicode),
        contents = nullable (unicode, use_mysql_longtext=True),
        published_datetime = nullable (datetime.datetime),
        discovery_datetime = datetime.datetime,
        effective_datetime = datetime.datetime,

        # The fingerprints serves to determine whether two instances of this class are the same wentry. Two wentries should have
        # the same unique_fingerprint iff they are the same entry. This is meant to determine whether updates in the contents
        # should overwrite a previous wentry or result in a new one being inserted. If two entries have the same unique_fingerprint
        # but differing content_fingerprints, then they are really two revisions of the same entry.
        unique_fingerprint = hashlib_md5_hexdigest,
        content_fingerprint = hashlib_md5_hexdigest,
        )):

    def __init__ (self, **kwargs):
        if not kwargs.get ('unique_fingerprint'):
            kwargs['unique_fingerprint'] = WEntry.compute_unique_fingerprint (**kwargs)
        if not kwargs.get ('content_fingerprint'):
            kwargs['content_fingerprint'] = WEntry.compute_content_fingerprint (**kwargs)
        if not kwargs.get ('effective_datetime'):
            kwargs['effective_datetime'] = kwargs.get('published_datetime') or kwargs.get('discovery_datetime')
        super(WEntry,self).__init__ (**kwargs)

    @classmethod
    def compute_unique_fingerprint (cls, **kwargs):
        # 2011-10-14 - guids - RSS feeds may include a guid string with every entry, which is supposed to save us the trouble of
        # implementing this, but after reading http://www.xn--8ws00zhy3a.com/blog/2006/08/rss-dup-detection I decided I'd rather
        # implement it myself than trust outside sources. Let's see how that works out.
        if kwargs.get ('url'):
            return hashlib.md5(kwargs['url']).hexdigest()
        else:
            return cls.compute_content_fingerprint (**kwargs)
    
    @classmethod
    def compute_content_fingerprint (cls, **kwargs):
        md5 = hashlib.md5()
        md5.update (repr(kwargs.get('title')))
        md5.update (repr(kwargs.get('contents')))
        # md5.update (repr(kwargs.get('datetime')))
        return md5.hexdigest()

    @classmethod
    def sanitize_contents (cls, wire, contents_str):
        if not contents_str:
            return None
        try:
            html = ET.HTML (contents_str)
        except Exception:
            html = None
        if html is None:
            # NB lxml sometimes returns None, e.g. if input is "\n"
            html = build_etree ('div', contents_str or '')
        else:
            html = make_all_urls_absolute (wire.xml_url, remove_all_js (html))
            for el in walk_subtree_allowing_edits (html):
                if el.tag in ('a','form','area') and 'target' in el.attrib:
                    del el.attrib['target']
                elif el.tag in ('iframe', 'link', 'style', 'base'):
                    # 2011-10-24 - decided to allow object and embed nodes
                    remove_node (el)
                elif el.get('style'):
                    # 2016-11-10 - commented this out to allow 'float', because I want it for the SkyScraperCity feed. Let's see
                    #              how long it takes before it comes back to bite me in the butt
                    # 2016-11-11 - answer: 24h. It makes images stick out of the wentry box. Put it back.
                    el.set ('style', re.sub (r'\bfloat\s*:\s*\w+', '', el.get('style')))
        return ET.tostring (html, encoding=unicode, method='html')

    @classmethod
    def from_feed_entry (cls, wire, feed_entry, discovery_datetime):
        discovery_datetime = discovery_datetime.replace (microsecond=0) # for compat with dates that have been thru MySQL, which discards this
        if feed_entry.url or feed_entry.title or feed_entry.contents:
            return WEntry (
                wire_id = wire.id,
                url = feed_entry.url,
                title = feed_entry.title,
                author = feed_entry.author,
                contents = cls.sanitize_contents (wire, feed_entry.contents),
                published_datetime = feed_entry.datetime,
                discovery_datetime = discovery_datetime,
                )


class User (struct (
        id = sql_numeric_id,
        name = {
            'class': str,
            'minlen': 2,
            'maxlen': 32,
            'regex': r'^\w+$',
            'coerce_value': string.lower,
            },
        admin = nullable (bool, default_value=False),
        readonly = nullable (bool, default_value=False),
        )):
    pass


class Folder (struct (
        id = sql_numeric_id,
        user_id = sql_foreign_numeric_id,
        parent_id = nullable (sql_foreign_numeric_id),
        label = unicode,
        )):
    pass


class Subscription (struct (
        user_id = sql_foreign_numeric_id,
        wire_id = sql_foreign_numeric_id,
        folder_id = sql_foreign_numeric_id,
        label = {
            'class': unicode,
            'minlen': 1,
            'maxlen': MAX_SUBSCRIPTION_LABEL_LEN,
            },
        active = bool,
        )):
    pass


class Stamp (struct (
        wentry_id = sql_foreign_numeric_id,
        user_id = sql_foreign_numeric_id,
        read = bool,
        starred = bool,
        vote = one_of (0,1,2,3,4,5,6),
        last_updated = datetime.datetime,

        # 2014-05-19 - It feels like a bit of a hack to add this field right into the main data struct, because it's so specific to
        # a process (the Pinboard backups) which is somewhat secondary, but it's also the simplest approach. We need to keep track
        # of pinboard status for each wentry, and already have a DB table with all wentries in it.
        archived_on_pinboard = bool,

        # These are only used to optimize the wentry selection query in DB.`wentry_list`. They need to be kept exact copies of
        # their counterparts in table `wentries'. There might be a better way.
        wire_id = sql_foreign_numeric_id,
        wentry_effective_datetime = datetime.datetime,
        )):

    VOTE_MEANINGS = {
        6: "Exceptionally interesting article, will take action based on it",
        5: "Interesting article, worth reading entirely",
        4: "Interesting headline, didn't read the article entirely",
        3: "My kind of news, and as such somewhat interesting, but does not really stand out",
        2: "Not interesting to me, could have been skipped",
        1: "Actually annoyed that this pops up in my RSS feeds",
        0: "Null vote (haven't voted yet)"
        }


class Traceback (struct (
        id = sql_numeric_id,
        tb_str = {
            'class': str,
            'maxlen': 1000,
            'nullable': True,
            'coerce_value': lambda s: (
                re.compile(r'(.{,500})\n.+\n(.{10,500})', re.S).sub (
                    lambda m: '%s\n...\n%s' % m.groups(),
                    s
                    ) if s and len(s) > 1000 else s
                ),
            },
        )):
    def __init__ (self, **kwargs):
        if not isinstance (kwargs.get('tb_str'), (None.__class__, str)):
            kwargs['tb_str'] = kwargs['tb_str'].encode ('UTF-8')
        super(Traceback,self).__init__ (**kwargs)


class WireUpdateError (struct (
        wire_id = sql_foreign_numeric_id,
        first_occ_datetime = datetime.datetime,
        last_occ_datetime = datetime.datetime,
        traceback_id = sql_foreign_numeric_id,
        num_occs = strictly_positive (int),
        )):
    pass


class WEntryListOptions (struct (
        # Contrarily to all the above, this one is not stored in the DB, it's only used to formalize params to some functions below
        # NB these values are duplicated in recess.js
        votes = nullable (
            dict_of (
                one_of (0,1,2,3,4,5,6),
                bool
                ),
            default_value = dict ((n,True) for n in xrange(7)),
            ),
        read = one_of ('read', 'unread', 'all', None, default='unread'),
        starred = one_of ('starred', 'starless',  'all', None, default='all'),
        search_query = {
            'class': unicode,
            'nullable': True,
            'coerce_type': lambda v: v or '',
            'coerce_value': string.strip,
            },
        order = one_of ('newest_first', 'oldest_first', 'wentry_id', 'random', 'natural', default='newest_first'),
        archived_on_pinboard = nullable (bool),

        selected_votes = lambda self: tuple (v for v,on in self.votes.iteritems() if on),
        all_votes_selected = lambda self: all (self.votes.get(n) for n in xrange(7)),
        )):
    pass


# Words longer than this are truncated
MAX_INDEXED_CHARS_PER_WORD = 25

class Vocab (struct (
        id = sql_numeric_id,
        word = {
            'class': unicode,
            'maxlen': MAX_INDEXED_CHARS_PER_WORD, 
            },
        )):
    __mysql_table_name__ = 'vocab'

class WordIndex (struct (
        vocab_id = sql_foreign_numeric_id,
        wentry_id = sql_foreign_numeric_id,
        )):
    __mysql_table_name__ = 'word_index'


#----------------------------------------------------------------------------------------------------------------------------------
# low-level utils

# This is the HTTP client used to fetch the RSS feeds
# 
http_client = SimpleHTTPClient (
    cache_path = here (__file__, '..', 'cache', 'http'),
    cache_life = hours (3),
    use_cookies = True,
    timeout = 30,
    )


#----------------------------------------------------------------------------------------------------------------------------------
# Cope operations

def fetch_feed (xml_url, wire_id=None, verbose=False):

    # About caching - There are two levels of disk caching here. One is the usual caching done by the HTTP client, as configured
    # above. This ensures that we don't re-request files from foreign servers more often than a set rate.
    # 
    # Then there's the last_fetched mechanism, which isn't a proper cache, but simply allows us to compare the "live" RSS bytes
    # with the RSS that we last parsed, and avoid even parsing the RSS document tree if it hasn't changed since we last saw it.
    # 
    last_fetched_dir = makedirs (here (__file__, '..', 'cache', 'last_fetched_xml'))
    with open (here (__file__, '..', '.location.secrets')) as fh:
        re_this_server,document_root = map (string.strip, fh)

    if verbose and wire_id:
        print
        print "Fetching wire %d..." % wire_id

    # If the URL is on the same server as Recess, try to read the file directly. This is somewhat hacky I guess, but has the
    # advantage that we don't need to cache these files, since fetching them is free, and so we'll be able to show updates to these
    # files faster. Which is important, because many of them (in my case) already are caches of remote content, and I don't want to
    # introduce yet another layer of caching.
    # 
    re_local_url = re.compile (
        # NB don't match if there is a ?query
        '^https?://%s/([^\?]+)$' % re_this_server
    )
    m = re_local_url.match (xml_url)
    local_file_path = m and os.path.join (document_root, m.group(1))

    last_fetched_file_path = wire_id and os.path.join (last_fetched_dir, '%d.xml' % wire_id)
    if last_fetched_file_path and os.path.isfile(last_fetched_file_path):
        with open (last_fetched_file_path, 'rb') as fh:
            xml_bytes_last_fetched = fh.read()
    else:
        xml_bytes_last_fetched = None

    if local_file_path and os.path.isfile (local_file_path) and not local_file_path.endswith('.cgi'):
        if verbose:
            print local_file_path
        with open (local_file_path, 'rb') as fh:
            xml_bytes_live = fh.read()
    else:
        xml_bytes_live = http_client.simple_get (xml_url)

    # If the live bytes are the same as the last_fetched bytes, don't waste CPU cycles parsing it
    if xml_bytes_live == xml_bytes_last_fetched:
        if verbose and wire_id:
            print "Wire %d skipped (RSS is same as last_fetched, byte for byte)" % wire_id
        return None,None

    if last_fetched_file_path:
        with open (last_fetched_file_path, 'wb') as fh:
            fh.write (xml_bytes_live)

    # FIXME - 2011-10-08 - feeds.py should use the encoding declared in the feed itself
    try:
        xml_uc = xml_bytes_live.decode ('UTF-8')
        encoding = 'UTF-8'
    except UnicodeError:
        xml_uc = xml_bytes_live.decode ('Windows-1252')
        encoding = 'Windows-1252'

    feed,all_entries = Feed.parse (xml_url, xml_uc, encoding)
    if verbose and wire_id:
        print "Wire %d updated" % wire_id
    return feed,all_entries


def subscribe (db, user, folder, url, verbose=False):
    feed,_ = fetch_feed (url, verbose=verbose)
    wire = db.wire_insert_if_new (
        Wire (
            xml_url = feed.xml_url,
            html_url = feed.html_url,
            default_title = feed.title,
            activated = True,
            )
        )
    if db.sub_by_user_and_wire_id (user, wire.id):
        # FIXME what if the sub is there, but active=0? We should flip the `active` bit, not error out
        raise Exception, "Already subscribed to that feed"
    sub = db.sub_insert_if_new (
        Subscription (
            wire_id = wire.id,
            user_id = user.id,
            folder_id = folder.id if folder else 0,
            label = feed.title[:MAX_SUBSCRIPTION_LABEL_LEN],
            active = True,
            )
        )
    db.stamp_all_wire_wentries_for_user (wire.id, user.id)
    return wire,sub


def unsubscribe (db, user, wire_id):
    db.sub_delete_by_user_and_wire_id (user, wire_id)
    if db.sub_count_by_wire(wire_id) == 0:
        db.wire_delete (wire_id)
    db.stamp_delete_by_wire_and_user (wire_id, user.id)

def update_wire (db, wire, verbose=False):
    now = datetime.datetime.now()
    try:
        feed,all_entries = fetch_feed (wire.xml_url, wire.id, verbose=verbose)
    except Exception, ex:
        error_tb_str = traceback.format_exc()[-1000:]
    else:
        error_tb_str = None
        if feed is not None:
            all_wentries = filter (None, tuple (
                WEntry.from_feed_entry (wire, entry, now)
                for entry in all_entries
            ))
            db.wentry_insert_all_if_new (wire, all_wentries)
    error_tb = Traceback(tb_str=error_tb_str)
    error = db.wire_update_error_report (wire, now, error_tb)
    return error,error_tb


def tokenize_text_for_index (txt):
    for m in re.finditer (r'\w+', txt, flags=re.UNICODE):
        word = m.group()
        if len(word) > MAX_INDEXED_CHARS_PER_WORD:
            word = word[:MAX_INDEXED_CHARS_PER_WORD]
        yield word.lower()

def extract_indexable_words (wentry):
    for txt in (wentry.title, wentry.contents):
        if txt:
            if '<' in txt:
                try:
                    txt = extract_text (html_etree (txt))
                except Exception:
                    raise # while debugging
                    txt = re.sub (r'<[^>]+>', '', txt)
            for word in tokenize_text_for_index (txt):
                yield word


#----------------------------------------------------------------------------------------------------------------------------------
# DB public interface

class DB (MySQLDB):

    def __init__ (self):
        with open (here (__file__, '..', '.mysql.secrets')) as fh:
            username,password = map (string.strip, fh)
        MySQLDB.__init__ (self, username, password, 'recess', 'localhost')

    def init_db (self):
        with self._transaction() as dbc:
            dbc.execute (Wire.mysql_stmt_create_table ('UNIQUE KEY (xml_url)'), [])
            dbc.execute (WEntry.mysql_stmt_create_table ('UNIQUE KEY (wire_id,unique_fingerprint)'), [])
            dbc.execute (User.mysql_stmt_create_table ('UNIQUE KEY (name)'), [])
            dbc.execute (Subscription.mysql_stmt_create_table ('PRIMARY KEY (user_id,wire_id), KEY active (active,wire_id,user_id)'), [])
            dbc.execute (Stamp.mysql_stmt_create_table ('PRIMARY KEY (wentry_id,user_id)'), [])
            dbc.execute (Folder.mysql_stmt_create_table ('UNIQUE KEY (user_id,parent_id,label(%d))' % MAX_SUBSCRIPTION_LABEL_LEN), [])
            dbc.execute (Traceback.mysql_stmt_create_table ('UNIQUE KEY (tb_str)'), [])
            dbc.execute (WireUpdateError.mysql_stmt_create_table ('PRIMARY KEY (wire_id,first_occ_datetime)'), [])
            dbc.execute (Vocab.mysql_stmt_create_table ('UNIQUE KEY (word)'), [])
            dbc.execute (WordIndex.mysql_stmt_create_table ('PRIMARY KEY (vocab_id,wentry_id)'), [])


    ### Wires

    def wire_load_all (self, only_wires_having_subs=True):
        with self._transaction() as dbc:
            if only_wires_having_subs:
                return dbc.load_all (
                    Wire,
                    "WHERE EXISTS (SELECT 1 FROM subscriptions WHERE active=b'1' AND wire_id=id)"
                )
            else:
                return dbc.load_all (Wire)

    def wire_all_by_subs (self, subs):
        if subs:
            with self._transaction() as dbc:
                return dbc.load_all (
                    Wire,
                    'WHERE `id` IN (%s)' % ','.join ('%s' for _ in subs),
                    [s.wire_id for s in subs]
                    )
        else:
            return []

    def wire_insert_if_new (self, wire):
        with self._transaction() as dbc:
            dbc.insert (wire, ignore_duplicates=True)
            return dbc.load_one (Wire, 'WHERE xml_url=%s', [wire.xml_url], escape_nulls=True)

    def wire_delete (self, wire_id):
        with self._transaction() as dbc:
            dbc.delete (Wire, 'WHERE `id`=%s', [wire_id])
            dbc.delete (WEntry,          'WHERE NOT EXISTS (SELECT 1 FROM wires where id=wire_id)')
            dbc.delete (WireUpdateError, 'WHERE NOT EXISTS (SELECT 1 FROM wires WHERE id=wire_id)')
            dbc.delete (Subscription,    'WHERE NOT EXISTS (SELECT 1 FROM wires WHERE id=wire_id)')
            dbc.delete (Stamp, 'WHERE NOT EXISTS (SELECT 1 FROM wentries WHERE id=wentry_id)')

            # FIXME - 2012-08-01 - this last one should also be performed after every update. I had 500K orphaned ones today.
            dbc.delete (Traceback, 'WHERE NOT EXISTS (SELECT 1 FROM wire_update_errors WHERE traceback_id=id)')


    ### WEntries

    def wentry_by_id (self, wentry_id):
        with self._transaction() as dbc:
            return dbc.load_one (WEntry, 'WHERE `id`=%s', [wentry_id])

    def wentry_insert_all_if_new (self, wire, all_wentries):
        assert all (isinstance(we,WEntry) for we in all_wentries)
        wire_subs = None
        now = datetime.datetime.now()
        with self._transaction() as dbc:
            for wentry in all_wentries:
                existing_copy = dbc.load_one (
                    WEntry,
                    'WHERE `wire_id`=%s AND `unique_fingerprint`=%s',
                    [wire.id, wentry.unique_fingerprint],
                    )
                if existing_copy:
                    # 2015-10-18 - I'm commenting out this whole paragraph. When I started recess I thought that keeping entries in
                    # sync with their source (i.e. updating the entry in Recess when it changes in the RSS feed) made good sense,
                    # but now that I use Recess as an archive of things I've read, this opens the door to me saving in Recess a
                    # text I liked, only for it to be changed from under my feet. I think this functionality brings little and is
                    # dangerous. So I'm switching it off.
                    # 
                    # I'm keeping the code here, in commented-out form, in case I realize in 2 days that this was a big mistake. If
                    # by 2016 I still don't see a reason to put it back, delete the whole thing.
                    # 
                    # existing_fields,updated_fields = (
                    #     tuple (
                    #         (f,v)
                    #         for f,v in zip (WEntry.mysql_ordered_field_ids(), we.mysql_obj2row())
                    #         if f not in ('id', 'wire_id', 'unique_fingerprint', 'discovery_datetime', 'effective_datetime')
                    #         )
                    #     for we in (existing_copy,wentry)
                    #     )
                    # if existing_fields != updated_fields:
                    #     if existing_copy.published_datetime != wentry.published_datetime:
                    #         updated_fields += ((
                    #                 'effective_datetime',
                    #                 wentry._typedefs['effective_datetime'].mysql_marshal_value_to_string(wentry.effective_datetime),
                    #                 ),)
                    #     dbc.execute (
                    #         'UPDATE `wentries` SET %s WHERE `wire_id`=%%s AND `unique_fingerprint`=%%s' % ','.join (
                    #             '`%s`=%%s' % f
                    #             for f,v in updated_fields
                    #             ),
                    #         [v for f,v in updated_fields] + [wire.id, wentry.unique_fingerprint],
                    #         )
                    #     if existing_copy.effective_datetime != wentry.effective_datetime:
                    #         dbc.execute (
                    #             'UPDATE `stamps` SET `wentry_effective_datetime`=%s WHERE `wentry_id`=%s',
                    #             [wentry.effective_datetime, existing_copy.id]
                    #             )
                    # wentry = existing_copy
                    pass
                else:
                    try:
                        insert_id = dbc.insert (wentry)
                    except Exception:
                        from sys import exc_info
                        exc_type, exc_value, exc_tb = exc_info()
                        raise exc_type, '%s (%s)' % (exc_value, wire.id), exc_tb
                    wentry = wentry.derive (id=insert_id)
                    if wire_subs is None:
                        wire_subs = self.sub_all_by_wire (wire.id)
                    for sub in wire_subs:
                        dbc.insert (
                            Stamp (
                                wentry_id = wentry.id,
                                user_id = sub.user_id,
                                read = False,
                                starred = False,
                                vote = 0,
                                last_updated = now,
                                wire_id = wire.id,
                                wentry_effective_datetime = wentry.effective_datetime,
                                archived_on_pinboard = False,
                                )
                            )

    def wentry_list (self, user, wire_ids, list_opts=WEntryListOptions(), limit=10, offset=0):
        with self._transaction() as dbc:

            # The DB contains wentries for wires we're no longer subscribed to. If wire_ids are given, we assume they're all
            # correct, but if not, we explicitly select all wires this user subscribed to
            if not wire_ids:
                dbc.execute ('SELECT wire_id FROM subscriptions WHERE user_id=%s', [user.id])
                wire_ids = tuple(int(row[0]) for row in dbc.dbc.fetchall())

            # NB we depart from good practice by inserting some integer values directly into the SQL query here rather than
            # interpolating them with '%s' and passing them in the 2nd arg to the MySQL lib. This makes the code here more
            # readable, and as long as they're only integers, inserted using %d, I don't think there's a risk of MySQL injection.

            dbc.execute (
                ''' SELECT %(selected_wentry_cols)s,wentry_effective_datetime,archived_on_pinboard,last_updated,`read`,starred,vote
                    FROM wentries
                    INNER JOIN stamps
                         ON wentries.id=stamps.wentry_id
                    WHERE
                         user_id=%(user_id)d
                         %(wire_selector)s
                         %(vote_selector)s
                         %(read_selector)s
                         %(star_selector)s
                         %(text_selector)s
                         %(pinb_selector)s
                    %(order)s
                    %(limit)s
                    %(offset)s ''' %
                {
                    'user_id': user.id,
                    'selected_wentry_cols': ','.join (
                        'wentries.%s' % col
                        for col in WEntry.mysql_ordered_field_ids()
                        ),
                    'wire_selector': (
                        # NB the query optimizer performs better if you use the stamps.wire_id, rather than wentries.wire_id
                        'AND stamps.wire_id IN (%s)' % ','.join ('%d' % i for i in wire_ids)
                        ),
                    'vote_selector': (
                        '' if list_opts.all_votes_selected or not list_opts.selected_votes else
                        'AND vote IN (%s)' % ','.join ('%d' % v for v in list_opts.selected_votes)
                        ),
                    'read_selector': {
                        'unread': ' AND `read` = 0',
                        'read':   ' AND `read` = 1',
                        'all':       '',
                        }[list_opts.read],
                    'star_selector': {
                        'starless' : ' AND starred = 0',
                        'starred':   ' AND starred = 1',
                        'all':       '',
                        }[list_opts.starred],
                    'pinb_selector': \
                        "AND archived_on_pinboard=b'%s'" % int(list_opts.archived_on_pinboard) \
                        if list_opts.archived_on_pinboard is not None else '',
                    'text_selector': ' AND MATCH(title,contents) AGAINST (%s)' if list_opts.search_query else '',
                    'limit':  'LIMIT %d'  % limit  if limit  else '',
                    'offset': 'OFFSET %d' % offset if offset else '',
                    'order': {
                        'newest_first': 'ORDER BY stamps.wentry_effective_datetime DESC',
                        'oldest_first': 'ORDER BY stamps.wentry_effective_datetime ASC',
                        'wentry_id': 'ORDER BY wentry_id ASC', # FIXME not indexed as of 2013-10-18. But this isn't for normal usage.
                        'random': 'ORDER BY RAND()', # 2014-01-22 - apparently this is inefficient. Fast enough for me though
                        'natural': '',
                        }[list_opts.order],

                    },

                (
                    [list_opts.search_query]
                    if list_opts.search_query
                    else []
                    )
                )
            for row in tuple(dbc.dbc.fetchall()):
                wentry = WEntry.mysql_row2obj (row[:-5])
                yield wentry, Stamp (
                    wentry_id = wentry.id,
                    user_id = user.id,
                    read = (row[-3] == '\x01'),
                    starred = (row[-2] == '\x01'),
                    vote = row[-1],
                    last_updated = row[-4],
                    archived_on_pinboard = row[-5],
                    wire_id = wentry.wire_id,
                    wentry_effective_datetime = row[-6],
                    )

    def wentry_iter_all (self, dbc, where='1'):
        page_size = 50
        for page_i in itertools.count (0, step=page_size):
            page_wentries = dbc.load_all (WEntry, 'WHERE %s ORDER BY `id` LIMIT %d,%d' % (where,page_i,page_size))
            if page_wentries:
                for wentry in page_wentries:
                    yield wentry
            else:
                break


    ### Users

    def user_by_name (self, username):
        if username:
            with self._transaction() as dbc:
                return dbc.load_one (User, 'WHERE name=%s', [username], escape_nulls=True)


    ### Subscriptions

    def sub_all_by_user (self, user, active_only):
        with self._transaction() as dbc:
            where_stmt = 'WHERE user_id=%s'
            where_args = [user.id]
            if active_only:
                where_stmt += " AND active=b'1'"
            return dbc.load_all (Subscription, where_stmt, where_args, escape_nulls=True)

    def sub_all_by_wire (self, wire_id):
        with self._transaction() as dbc:
            # FIXME not indexed
            return dbc.load_all (Subscription, 'WHERE wire_id=%s', [wire_id])

    def sub_count_by_wire (self, wire_id):
        with self._transaction() as dbc:
            # FIXME not indexed
            return dbc.count (Subscription, 'WHERE wire_id=%s', [wire_id])

    def sub_by_user_and_wire_id (self, user, wire_id):
        with self._transaction() as dbc:
            return dbc.load_one (Subscription, 'WHERE user_id=%s AND wire_id=%s', [user.id, wire_id])

    def sub_insert_if_new (self, sub):
        with self._transaction() as dbc:
            new_id = dbc.insert (sub, ignore_duplicates=True)
            if new_id:
                sub = sub.derive (id=new_id)
            return sub

    def sub_delete_by_user_and_wire_id (self, user, wire_id):
        with self._transaction() as dbc:
            dbc.delete (Subscription, 'WHERE user_id=%s AND wire_id=%s', [user.id, wire_id])

    def sub_save (self, sub):
        with self._transaction() as dbc:
            dbc.replace (sub)
        return sub


    ### Folders

    def folder_insert_if_new (self, folder, error_if_already_exists=True):
        with self._transaction() as dbc:
            new_id = dbc.insert (
                folder,
                ignore_duplicates = not error_if_already_exists,
            )
            if new_id:
                folder = folder.derive(id=new_id)
            # NB - 2011-10-10 - Folder objects that existed in the DB but didn't have their ID set coming in won't have their ID
            # set going out, is that what we want?
            return folder

    def folder_all_by_user (self, user):
        with self._transaction() as dbc:
            return dbc.load_all (Folder, 'WHERE user_id=%s', [user.id], escape_nulls=True)

    def folder_by_id (self, id):
        with self._transaction() as dbc:
            return dbc.load_one (Folder, 'WHERE id=%s', [id])

    def folder_by_label (self, user, label):
        with self._transaction() as dbc:
            # FIXME not indexed
            return dbc.load_one (Folder, 'WHERE user_id=%s AND label=%s', [user.id, label])

    def folder_save (self, folder):
        with self._transaction() as dbc:
            dbc.replace (folder)
        return folder

    def folder_delete (self, folder_id):
        with self._transaction() as dbc:
            if dbc.count (Folder, 'WHERE parent_id=%s', [folder_id]):
                raise ValueError ("Cannot delete: folder has subfolders")
            if dbc.count (Subscription, 'WHERE folder_id=%s', [folder_id]):
                raise ValueError ("Cannot delete: folder has subscriptions")
            return dbc.delete (Folder, 'WHERE id=%s', [folder_id])


    ### Tracebacks

    @cache_mru (25, key=lambda self,tb,*rest: tb)
    def _traceback_insert_if_new (self, tb, dbc):
        dbc.insert (tb, ignore_duplicates=True)
        return dbc.load_one (Traceback, 'WHERE tb_str=%s', [tb.tb_str], escape_nulls=True)


    ### Wire Update Errors

    def wire_update_error_report (self, wire, dt, tb):
        with self._transaction() as dbc:
            tb = self._traceback_insert_if_new (tb, dbc)
            prev = dbc.load_one (
                WireUpdateError,
                'WHERE wire_id=%s ORDER BY first_occ_datetime DESC LIMIT 1',
                [wire.id],
                escape_nulls=True
            )
            if prev and prev.traceback_id == tb.id:
                dbc.replace (
                    prev.derive (
                        num_occs = prev.num_occs + 1,
                        last_occ_datetime = dt
                    )
                )
                return prev
            else:
                error = WireUpdateError (
                    wire_id = wire.id,
                    traceback_id = tb.id,
                    num_occs = 1,
                    first_occ_datetime = dt,
                    last_occ_datetime = dt,
                )
                dbc.insert (error)
                return error

    def wire_update_error_lookup_tb_str (self, wires):
        if not wires:
            return {}
        with self._transaction() as dbc:
            null_tb = self._traceback_insert_if_new (Traceback(tb_str=None), dbc)                

            dbc.execute (
                ''' SELECT %s
                    FROM wire_update_errors w
                    INNER JOIN (
                            SELECT wire_id,MAX(first_occ_datetime) first
                            FROM wire_update_errors
                            GROUP BY wire_id
                            ) first
                        ON  w.wire_id=first.wire_id
                        AND w.first_occ_datetime=first.first
                    ''' % (
                        ','.join ('w.%s' % col for col in WireUpdateError.mysql_ordered_field_ids()),
                    ), []
                )
            errors = dict (
                (e.wire_id,e)
                for e in map (WireUpdateError.mysql_row2obj, dbc.dbc.fetchall())
                )

            tb_ids = sorted (set (e.traceback_id for e in errors.itervalues()))
            tbs = dict (
                (tb.id,tb)
                for tb in dbc.load_all (
                    Traceback,
                    'WHERE id in (%s)' % ','.join ('%d' % i for i in tb_ids),
                    []
                    )
                )

            ret = {}
            for w in wires:
                if w.id in errors:
                    ret[w.id] = (errors[w.id],tbs[errors[w.id].traceback_id])
                else:
                    # This happens if the wire has never been updated
                    now = datetime.datetime.now()
                    ret[w.id] = (
                        WireUpdateError (
                            wire_id = w.id,
                            traceback_id = null_tb.id,
                            num_occs = 1,
                            first_occ_datetime = now,
                            last_occ_datetime = now,
                            ),
                        null_tb
                        )

            return ret


    ### Stamps

    def stamp_load_one (self, user_id, wentry_id):
        with self._transaction() as dbc:
            return dbc.load_one (
                Stamp,
                'WHERE `user_id`=%s AND `wentry_id`=%s',
                [user_id, wentry_id]
            )

    def stamp_save (self, stamp):
        with self._transaction() as dbc:
            dbc.replace (stamp)

    def stamp_count_unread_wentries (self, user, all_wire_ids=None):
        with self._transaction() as dbc:
            dbc.execute (
                ''' SELECT wire_id,COUNT(*)
                    FROM stamps
                    WHERE
                        user_id = %(user_id)d
                        AND `read` = 0
                        %(wire_id_selector)s
                    GROUP BY wire_id 
                ''' % {
                    'user_id': user.id,
                    'wire_id_selector': (
                        'AND wire_id IN (%s)' % ','.join ('%d' % i for i in all_wire_ids)
                        if all_wire_ids else ''
                        ),
                    },
                []
                )
            return dict (dbc.dbc.fetchall())

    def stamp_all_wire_wentries_for_user (self, wire_id, user_id, read=0, starred=0, vote=0):
        now_str = datetime.datetime.now().strftime ('%Y-%m-%d %H:%M:%S')
        with self._transaction() as dbc:
            dbc.execute (
                ''' INSERT IGNORE
                    INTO   `stamps` (`wentry_id`, `user_id`,   `read`,   `starred`,   `vote`, `last_updated`, `wire_id`, `wentry_effective_datetime`)
                    SELECT           `id`,       %(user_id)d, %(read)d, %(starred)d, %(vote)d, "%(now_str)s", %(wire_id)d, effective_datetime
                    FROM `wentries`
                    WHERE wire_id=%(wire_id)d
                ''' % locals(),
                []
                )

    def stamp_mark_as_read (self, user_id, wentry_ids):
        with self._transaction() as dbc:
            dbc.execute (
                ''' UPDATE stamps SET `read`=1 WHERE user_id=%d AND wentry_id IN (%s) ''' % (
                    user_id,
                    ','.join ('%d' % i for i in wentry_ids)
                    ),
                [],
                )

    def stamp_vote_tallies (self, user_id):
        with self._transaction() as dbc:
            dbc.execute (
                ''' SELECT wires.*,wire_id,vote,COUNT(*)
                    FROM stamps
                    JOIN wires ON id=wire_id
                    WHERE user_id=%s
                    GROUP by wire_id,vote
                ''',
                user_id,
                )
            tallies = collections.defaultdict (dict)
            for row in tuple(dbc.dbc.fetchall()):
                wire = Wire.mysql_row2obj (row[:-3])
                vote,count = row[-2:]
                tallies[wire][vote] = count
            return tallies

    def stamp_delete_by_wire_and_user (self, wire_id, user_id):
        with self._transaction() as dbc:
            dbc.delete (Stamp, 'WHERE wire_id=%s AND user_id=%s', [wire_id, user_id])


    ### vocab and word index

    def vocab_clear (self, dbc):
        dbc.execute ('TRUNCATE TABLE `vocab`', [])

    def vocab_id (self, dbc, word, insert_if_missing=False):
        dbc.execute ('SELECT `id` FROM `vocab` WHERE `word`=%s', [word])
        row = dbc.dbc.fetchone()
        if row:
            return row[0]
        elif insert_if_missing:
            dbc.execute ('INSERT INTO `vocab` SET `word`=%s', [word])
            return self.db.insert_id()
        else:
            return None

    def word_index_clear (self, dbc):
        dbc.execute ('TRUNCATE TABLE `word_index`', [])

    def word_index_add (self, dbc, wentry_id, vocab_id, ignore_duplicates=False):
        dbc.execute (
            'INSERT ' + ('IGNORE ' if ignore_duplicates else '') + 'INTO `word_index` SET `wentry_id`=%s,`vocab_id`=%s',
            [wentry_id, vocab_id],
            )


#----------------------------------------------------------------------------------------------------------------------------------