Atomstrom/atomstrom.py

375 lines
13 KiB
Python
Raw Normal View History

2010-10-26 23:02:37 +02:00
#!/usr/bin/env python
#coding: utf-8
2010-10-26 23:02:37 +02:00
from sqlalchemy import create_engine, Table, Column, Integer, Text, String, Boolean, DateTime, MetaData, ForeignKey, desc, or_
2010-10-26 23:02:37 +02:00
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.ext.declarative import declarative_base
2013-03-19 21:18:38 +01:00
from datetime import datetime, timedelta
2013-03-18 20:54:00 +01:00
from time import mktime
2010-10-26 23:02:37 +02:00
import feedparser
import re
import sys
2010-10-29 09:09:25 +02:00
import urllib
2013-03-20 20:30:54 +01:00
import urllib2
2010-10-30 00:21:24 +02:00
import hn
import html2text
import ConfigParser
2013-03-19 21:18:38 +01:00
import pprint
from optparse import OptionParser
from cStringIO import StringIO
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header
from email import Charset
from email.generator import Generator
import smtplib
2010-10-26 23:02:37 +02:00
# Declarative base class shared by all ORM models defined below.
Base = declarative_base()
class Feed(Base):
    """ORM model for one subscribed feed and its per-feed processing options."""
    __tablename__ = 'feed'
    id = Column(Integer, primary_key=True)
    # URL of the feed to fetch.
    url = Column(Text)
    # Minimum number of minutes between fetches (used by fetch_single_feed).
    frequency = Column(Integer)
    # True: entries go into the daily digest mail; False/NULL: mailed one by one.
    daily = Column(Boolean)
    # Follow HTTP redirects of each entry link and store the final URL.
    resolveredirects = Column(Boolean)
    # Fetch a readability-extracted version of each entry's page.
    readability = Column(Boolean)
    # Fetch the full linked page and convert it to text.
    fullpage = Column(Boolean)
    # Run the entry summary through html2text.
    html2textsummary = Column(Boolean)
    # When converting the summary, drop images.
    html2textignoreimages = Column(Boolean)
    # Only enabled feeds are fetched and mailed.
    enabled = Column(Boolean)

    def __init__(self, url, daily, readability, fullpage, enabled, html2textsummary):
        # NOTE(review): frequency, resolveredirects and html2textignoreimages
        # cannot be set through this constructor and stay NULL until set
        # elsewhere (e.g. directly in the database) -- confirm intended.
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textsummary = html2textsummary
        self.enabled = enabled

    def __repr__(self):
        return "<Feed('%s','%s','%s')>" % (self.url, self.daily, self.readability)
class Feedinfo(Base):
    """ORM model holding feed-level metadata from the last fetch (1:1 with Feed)."""
    __tablename__ = 'feedinfo'
    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(String(255))
    link = Column(String(255))
    subtitle = Column(String(255))
    author = Column(String(255))
    publisher = Column(String(255))
    # HTTP status, feed format version and encoding as reported by feedparser.
    status = Column(Integer)
    version = Column(String(16))
    encoding = Column(String(16))
    # feedparser's "bozo" flag: non-zero when the feed was not well-formed.
    bozo = Column(Integer)
    # When the feed was last fetched at all / last fetched successfully.
    lastfetched = Column(DateTime)
    lastsuccessful = Column(DateTime)

    def __init__(self, parser):
        """parser: a feedparser result object for this feed."""
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)

    def update(self, parser):
        """Copy feed-level metadata out of a feedparser result.

        Only fields present in the parsed feed are overwritten; the
        lastfetched/lastsuccessful timestamps are refreshed.
        """
        feed = parser.feed
        if 'title' in feed:
            self.title = feed.get('title')
        if 'link' in feed:
            self.link = feed.get('link')
        if 'subtitle' in feed:
            self.subtitle = feed.get('subtitle')
        if 'author' in feed:
            self.author = feed.get('author')
        if 'publisher' in feed:
            # BUG FIX: this branch used to overwrite self.author instead of
            # self.publisher, clobbering the author with the publisher value.
            self.publisher = feed.get('publisher')
        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')
        self.lastfetched = datetime.now()
        # 200 (OK) and 302 (temporary redirect) both count as a successful fetch.
        if parser.get('status') in (200, 302):
            self.lastsuccessful = datetime.now()
class Entry(Base):
    """ORM model for a single feed entry plus the fetched page variants."""
    __tablename__ = 'entry'
    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    enclosures = Column(Text)
    # Final URL after following redirects (only filled when Feed.resolveredirects).
    resolvedlink = Column(String(255))
    # Full linked page converted to text (only filled when Feed.fullpage).
    fullpage = Column(Text)
    # Readability-extracted page text (only filled when Feed.readability).
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # When the entry was mailed out; NULL means "not sent yet".
    sent = Column(DateTime)

    def __init__(self, entry):
        """entry: a feedparser entry object."""
        self.update(entry)
        self.firstfetched = datetime.now()

    def __repr__(self):
        return "<Entry('%s','%s','%s')>" % (self.title, "", "")

    def update(self, entry):
        """Copy entry-level data out of a feedparser entry.

        Only fields present in the parsed entry are overwritten; the
        lastfetched timestamp is refreshed.
        """
        # IDIOM FIX: dict.has_key() is deprecated (gone in Python 3); use "in".
        if 'title' in entry:
            self.title = entry.get('title')
        if 'link' in entry:
            self.link = entry.get('link')
        if 'summary' in entry:
            self.summary = entry.get('summary')
        if 'content' in entry:
            # feedparser delivers content as a list of variants; keep the first.
            self.content = entry.get('content')[0].value
        if 'author' in entry:
            self.author = entry.get('author')
        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            self.updated = datetime.fromtimestamp(mktime(updated_parsed))
        if 'enclosures' in entry and len(entry.get('enclosures')) > 0:
            # Enclosures are only dumped for debugging; they are not persisted yet.
            print('enclosures')
            pp = pprint.PrettyPrinter(depth=4)
            pp.pprint(entry.get('enclosures'))
            #self.enclosures = entry.get('enclosures')
        self.lastfetched = datetime.now()
2013-03-19 20:09:44 +01:00
def send_mail(sender, receiver, subject, body):
    """Send a UTF-8 plain-text mail through the local SMTP server.

    sender/receiver are [display_name, address] pairs; body is the
    plain-text message.
    """
    print('sending to %s: %s' % (receiver[0], subject))
    # Use quoted-printable instead of base64 for utf-8 parts.
    Charset.add_charset('utf-8', Charset.QP, Charset.QP, 'utf-8')
    mail = MIMEMultipart('alternative')
    mail['Subject'] = "%s" % Header(subject, 'utf-8')
    mail['From'] = "\"%s\" <%s>" % (Header(sender[0], 'utf-8'), sender[1])
    mail['To'] = "\"%s\" <%s>" % (Header(receiver[0], 'utf-8'), receiver[1])
    textpart = MIMEText(body, 'plain', 'utf-8')
    mail.attach(textpart)
    # Flatten through Generator to get proper header folding/encoding.
    str_io = StringIO()
    gen = Generator(str_io, False)
    gen.flatten(mail)
    s = smtplib.SMTP('localhost')
    try:
        # NOTE(review): the empty envelope sender ("" = null reverse-path,
        # suppresses bounces) is kept from the original -- confirm intended.
        s.sendmail("", receiver[1], str_io.getvalue())
    finally:
        # BUG FIX: the SMTP connection was never closed before.
        s.quit()
2010-10-30 11:16:37 +02:00
def get_entry_text(entry):
    """Return the best available text for an entry.

    Preference order: readability extract, then full page, then summary;
    falls back to a fixed placeholder when all are empty.
    """
    for candidate in (entry.readability, entry.fullpage, entry.summary):
        if candidate:
            return candidate
    return 'no text, sorry'
2013-03-19 19:32:10 +01:00
def mail_daily_digest(session, sender, receiver, prefix):
    """Mail one digest containing every unsent entry of enabled daily feeds.

    Entries are marked as sent only after the mail went out; on any error
    while building the body, nothing is sent and nothing is marked.
    """
    print('mailing daily digest...')
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        # Prefer the redirect-resolved URL when one was stored.
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        try:
            body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
            body = body + ' %s\n' % entry.title
            # Only the first 100 characters of the text as a teaser.
            body = body + '%s\n' % get_entry_text(entry)[0:100]
            body = body + '%s\n\n' % link
        except Exception:
            # BUG FIX: was a bare "except:"; keep the best-effort abort but
            # no longer swallow SystemExit/KeyboardInterrupt.
            print('ERROR processing entry %s' % entry.id)
            print(sys.exc_info())
            print('not sending mail')
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        # Mark everything that went into the digest as sent.
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print('no unmailed digest-entries found... not sending mail.')
2010-10-30 11:16:37 +02:00
2013-03-19 19:32:10 +01:00
def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    """Mail one entry on its own and mark it as sent.

    The mail's From display name is the feed title, so the mail appears
    to come from the feed itself.
    """
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    # Prefer the redirect-resolved URL when one was stored.
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = '%s\n\n' % get_entry_text(entry)
    body = body + '%s\n' % feedinfo.link
    body = body + '%s\n' % link
    # BUG FIX: previously did "sender[0] = feedinfo.title", mutating the
    # caller's list; build a per-call sender pair instead.
    send_mail([feedinfo.title, sender[1]], receiver, subject, body)
    entry.sent = datetime.now()
2010-10-30 11:16:37 +02:00
2013-03-19 19:32:10 +01:00
def mail_single_entries(session, sender, receiver, prefix):
    """Mail every unsent entry of enabled, non-daily feeds individually."""
    print('mailing single entries...')
    count = 0
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    # BUG FIX: the filter above used Python's "or" on two SQLAlchemy clauses
    # ("Feed.daily == 0 or Feed.daily == None"), which is evaluated in Python
    # and passes only the first clause to SQL -- feeds with daily = NULL were
    # never matched. or_() builds the intended SQL OR.
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print('sent %d mails' % count)
    else:
        print('no unmailed single entries found... not sending mail.')
2010-10-30 11:16:37 +02:00
2010-10-30 00:21:24 +02:00
def fetch_readability(link):
    """Return a readability-extracted version of the page behind *link*.

    Delegates to hn.upgradeLink and decodes the result from UTF-8.
    """
    raw = hn.upgradeLink(link)
    return raw.decode('utf8')
2010-10-29 09:09:25 +02:00
def fetch_full_page(link):
    """Download the page behind *link* and return it converted to text.

    The page is decoded as UTF-8, run through html2text and re-encoded to
    latin-1 with unmappable characters replaced (kept from the original
    behaviour).
    """
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    try:
        html = response.read()
    finally:
        # BUG FIX: the HTTP response was never closed before.
        response.close()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')
2010-10-29 09:09:25 +02:00
def process_feed_entry(session, feed, entry):
    """Store one parsed entry; return 1 if it was new, 0 if already known.

    A known entry (matched by title + link) only gets its lastfetched
    timestamp refreshed. A new entry is optionally enriched according to
    the feed's options before being committed.
    """
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    # NOTE(review): the duplicate lookup is not scoped to this feed, so an
    # identical title+link in two feeds dedupes across feeds -- confirm intended.
    if thisentry:
        print(' entry already known <%s>' % entry.title)
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    print(' new entry <%s>' % entry.title)
    thisentry = Entry(entry)
    if feed.resolveredirects:
        print(' fetching final link <%s>' % entry.link)
        request = urllib2.Request(entry.link)
        opener = urllib2.build_opener()
        result = opener.open(request)
        try:
            thisentry.resolvedlink = result.url
        finally:
            # BUG FIX: the HTTP response was never closed before.
            result.close()
        print(' final link: <%s>' % thisentry.resolvedlink)
    if feed.fullpage:
        print(' fetching full page <%s>' % entry.link)
        thisentry.fullpage = fetch_full_page(entry.link)
    if feed.readability:
        print(' fetching readability <%s>' % entry.link)
        thisentry.readability = fetch_readability(entry.link)
    if feed.html2textsummary:
        print(' converting summary')
        h2t = html2text.HTML2Text()
        h2t.body_width = 0  # do not hard-wrap lines
        if feed.html2textignoreimages:
            h2t.ignore_images = True
        thisentry.summary = h2t.handle(thisentry.summary)
    feed.entry.append(thisentry)
    session.commit()
    return 1
def fetch_single_feed(session, feed):
    """Fetch one feed if its per-feed interval allows it and store its entries.

    A feed without an existing Feedinfo row is treated as new and fetched
    immediately; otherwise the fetch is skipped until lastfetched +
    frequency minutes have passed.
    """
    print('processing %s' % feed.url)
    query = session.query(Feedinfo).filter(Feedinfo.feed_id==feed.id)
    fetched = False
    try:
        feed.feedinfo = query.one()
        # Respect the per-feed minimum fetch interval.
        nextfetch = (feed.feedinfo.lastfetched + timedelta(minutes=feed.frequency))
        if datetime.now() > nextfetch:
            print('fetching...')
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo.update(parser)
        else:
            print('not fetching before: %s' % nextfetch)
    except Exception:
        # FIX: was the Python-2-only "except Exception, e" with an unused e.
        # NOTE(review): ANY error lands here and is treated as a brand-new
        # feed -- consider catching sqlalchemy's NoResultFound specifically.
        print('this feed seems to be new')
        print('fetching...')
        parser = feedparser.parse(feed.url)
        fetched = True
        feed.feedinfo = Feedinfo(parser)
    if fetched:
        print('processing feed entries:')
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print('updated %d of %d entries' % (entries_new, entries_total))
def fetch_all_feeds(session):
    """Fetch every enabled feed in id order, printing a blank separator line."""
    print('fetching all feeds...')
    enabled_feeds = session.query(Feed).filter_by(enabled=1).order_by(Feed.id)
    for current_feed in enabled_feeds:
        fetch_single_feed(session, current_feed)
        print('')
if __name__ == '__main__':
    # Read database and mail settings from the config file in the working dir.
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    # Create any missing tables for the models defined above.
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))
    # Command-line actions; any combination of -f/-s/-d may be given.
    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
    (options, args) = parser.parse_args()
    if options.fetch:
        fetch_all_feeds(session)
    if options.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if options.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    # No action given: show usage.
    if not (options.fetch or options.single or options.daily):
        parser.print_help()
    session.commit()