improved debug output, included html2text

This commit is contained in:
2010-10-31 13:30:26 +01:00
parent 18a71faee7
commit 8871785cb1
2 changed files with 472 additions and 14 deletions

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey, desc
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.ext.declarative import declarative_base
import datetime
@@ -9,6 +9,7 @@ import re
import sys
import urllib
import hn
import html2text
Base = declarative_base()
@@ -150,7 +151,7 @@ def mail_daily_digest():
sender = 'atomstrom'
body = ''
count = 0
for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==1).order_by(Entry.firstfetched).all():
for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==1).order_by(desc(Entry.firstfetched)).all():
count = count + 1
body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
body = body + ' %s\n' % entry.title
@@ -183,22 +184,29 @@ def fetch_readability(link):
def fetch_full_page(link):
opener = urllib.FancyURLopener({})
response = opener.open(link)
return response.read()
html = response.read()
html = html.decode('utf8')
text = html2text.html2text(html)
return text.encode('latin-1', 'replace')
def process_feed_entry(feed, entry):
query = session.query(Entry).filter_by(feed_id=feed.id, title=entry.title.encode('latin-1', 'replace'))
try:
thisentry = query.one()
thisentry.update(entry)
return '-'
print ' entry already known <%s>' % entry.title
return 0
except Exception, e:
print ' new entry <%s>' % entry.title
thisentry = Entry(entry)
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
thisentry.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
thisentry.readability = fetch_readability(entry.link)
feed.entry.append(thisentry)
return '+'
return 1
def fetch_single_feed(feed):
print 'fetching %s' % feed.url
@@ -209,18 +217,17 @@ def fetch_single_feed(feed):
feed.feedinfo = query.one()
feed.feedinfo.update(parser)
except Exception, e:
print 'this feed seems to be new'
feed.feedinfo = Feedinfo(parser)
print 'processing feed entries: ',
print 'processing feed entries:'
entries_new = 0
entries_total = 0
for entry in parser.entries:
entries_total = entries_total + 1
ret = process_feed_entry(feed, entry)
if ret == '+':
entries_new = entries_new + 1
sys.stdout.write(ret)
print ' (%d/%d new)' % (entries_new, entries_total)
entries_new = entries_new + process_feed_entry(feed, entry)
session.commit()
print 'fetched %d from %d entries' % (entries_total, entries_new)
def fetch_all_feeds():
print 'fetching all feeds...'
@@ -229,8 +236,8 @@ def fetch_all_feeds():
print
if __name__ == '__main__':
#fetch_all_feeds()
#mail_single_entries()
mail_daily_digest()
fetch_all_feeds()
mail_single_entries()
#mail_daily_digest()
session.commit()