2010-10-26 23:02:37 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
2010-10-31 13:30:26 +01:00
|
|
|
import ConfigParser
import pprint
import re
import smtplib
import sys
import urllib
from datetime import datetime, timedelta
from email.mime.text import MIMEText
from optparse import OptionParser
from time import mktime

import feedparser
import html2text
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey, desc
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.orm.exc import NoResultFound

import hn
|
2010-10-26 23:02:37 +02:00
|
|
|
|
|
|
|
# Declarative base class shared by all ORM models below (Feed, Feedinfo, Entry).
Base = declarative_base()
|
|
|
|
|
|
|
|
class Feed(Base):
    """ORM model for one subscribed feed and its per-feed options."""
    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    url = Column(Text)                  # feed URL that gets fetched
    frequency = Column(Integer)         # minimum minutes between two fetches
    daily = Column(Boolean)             # entry goes into the daily digest instead of a single mail
    readability = Column(Boolean)       # also fetch a readability extract per entry
    fullpage = Column(Boolean)          # also fetch the full linked page per entry
    html2textsummary = Column(Boolean)  # convert the HTML summary to plain text
    enabled = Column(Boolean)           # inactive feeds are skipped everywhere

    def __init__(self, url, daily, readability, fullpage, enabled, html2textsummary):
        self.url = url
        self.enabled = enabled
        self.daily = daily
        self.fullpage = fullpage
        self.readability = readability
        self.html2textsummary = html2textsummary

    def __repr__(self):
        return "<Feed('{0}','{1}','{2}')>".format(self.url, self.daily, self.readability)
|
|
|
|
|
|
|
|
|
|
|
|
class Feedinfo(Base):
    """ORM model holding the feed-level metadata from the last fetch.

    One-to-one companion of Feed; refreshed from a feedparser result on
    every fetch via update().
    """
    __tablename__ = 'feedinfo'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(Text)
    link = Column(Text)
    subtitle = Column(Text)
    author = Column(Text)
    publisher = Column(Text)
    status = Column(Integer)           # HTTP status of the last fetch
    version = Column(Text)             # feed format version reported by feedparser
    encoding = Column(Text)            # character encoding reported by feedparser
    bozo = Column(Integer)             # feedparser "bozo" flag: feed was malformed
    lastfetched = Column(DateTime)     # time of the last fetch attempt
    lastsuccessful = Column(DateTime)  # time of the last fetch answered with 200/302

    def __init__(self, parser):
        """Initialize all columns from a feedparser result object."""
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)

    def update(self, parser):
        """Refresh all metadata columns from a feedparser result object.

        Text fields are stored latin-1 encoded with unmappable characters
        replaced, matching the storage convention of the rest of this module.
        """
        if 'title' in parser.feed:
            self.title = parser.feed.get('title').encode('latin-1', 'replace')
        if 'link' in parser.feed:
            self.link = parser.feed.get('link')
        if 'subtitle' in parser.feed:
            self.subtitle = parser.feed.get('subtitle').encode('latin-1', 'replace')
        if 'author' in parser.feed:
            self.author = parser.feed.get('author').encode('latin-1', 'replace')
        if 'publisher' in parser.feed:
            # BUG FIX: this previously assigned self.author, clobbering the
            # author and leaving the publisher column always empty.
            self.publisher = parser.feed.get('publisher').encode('latin-1', 'replace')
        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')
        self.lastfetched = datetime.now()
        # 302 is counted as success as well -- presumably because feedparser
        # follows the redirect and still delivers entries; TODO confirm.
        if parser.get('status') == 200 or parser.get('status') == 302:
            self.lastsuccessful = datetime.now()
|
2010-10-26 23:02:37 +02:00
|
|
|
|
|
|
|
|
|
|
|
class Entry(Base):
    """ORM model for a single feed entry (one article)."""
    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(Text)
    link = Column(Text)
    summary = Column(Text)
    content = Column(Text)
    author = Column(Text)
    enclosures = Column(Text)
    fullpage = Column(Text)          # full linked page, filled when Feed.fullpage is set
    readability = Column(Text)       # readability extract, filled when Feed.readability is set
    updated = Column(DateTime)       # 'updated' timestamp taken from the feed itself
    firstfetched = Column(DateTime)  # when we saw this entry the first time
    lastfetched = Column(DateTime)   # when we saw this entry the last time
    sent = Column(DateTime)          # when the entry was mailed; None = not yet sent

    def __init__(self, entry):
        self.update(entry)
        self.firstfetched = datetime.now()

    def __repr__(self):
        return "<Entry('%s','%s','%s')>" % (self.title, "", "")

    def update(self, entry):
        """Refresh all columns from a feedparser entry dict."""
        if 'title' in entry:
            self.title = entry.get('title').encode('latin-1', 'replace')
        if 'link' in entry:
            self.link = entry.get('link').encode('latin-1', 'replace')
        if 'summary' in entry:
            self.summary = entry.get('summary').encode('latin-1', 'replace')
        if 'content' in entry:
            # feedparser delivers content as a list; only the first part is kept
            self.content = entry.get('content')[0].value.encode('latin-1', 'replace')
        if 'author' in entry:
            self.author = entry.get('author').encode('latin-1', 'replace')
        if 'updated_parsed' in entry:
            # convert feedparser's struct_time into a datetime
            self.updated = datetime.fromtimestamp(mktime(entry.get('updated_parsed')))
        if 'enclosures' in entry and len(entry.get('enclosures')) > 0:
            # enclosures are only dumped for debugging; storing them is still open
            print('enclosures')
            pprint.PrettyPrinter(depth=4).pprint(entry.get('enclosures'))
            #self.enclosures = entry.get('enclosures').encode('latin-1', 'replace')
        self.lastfetched = datetime.now()
|
2010-10-26 23:02:37 +02:00
|
|
|
|
|
|
|
|
2013-03-19 20:09:44 +01:00
|
|
|
def send_mail(sender, receiver, subject, body):
    """Send a plain-text mail through the SMTP server on localhost.

    All string arguments are expected latin-1 encoded, which is the storage
    encoding used throughout this module.
    """
    print('sending to %s: %s' % (receiver.decode('latin-1'), subject.decode('latin-1')))
    # Declare the charset explicitly: bodies are latin-1 encoded, and the
    # MIMEText default (us-ascii) would mislabel any non-ascii character.
    mail = MIMEText(body, 'plain', 'latin-1')
    mail['From'] = sender
    mail['To'] = receiver
    mail['Subject'] = subject
    mailserver = smtplib.SMTP('localhost')
    try:
        mailserver.sendmail(sender, [receiver], mail.as_string())
    finally:
        # always close the SMTP connection, even when sendmail fails
        mailserver.quit()
|
|
|
|
|
2010-10-30 11:16:37 +02:00
|
|
|
|
2010-10-30 11:39:19 +02:00
|
|
|
def get_entry_text(entry):
    """Return the best available text for an entry.

    Preference order: readability extract, then full page, then summary;
    falls back to a placeholder when none of them is set.
    """
    return (entry.readability
            or entry.fullpage
            or entry.summary
            or 'no text, sorry')
|
|
|
|
|
2013-03-19 19:32:10 +01:00
|
|
|
def mail_daily_digest(session, sender, receiver, prefix):
    """Mail one digest of all unsent entries of enabled daily feeds.

    Entries are listed newest first, mailed in a single message, and marked
    as sent only after the mail went out.
    """
    print('mailing daily digest...')
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    # build the digest with a list + join instead of quadratic string +=
    parts = []
    for feed, feedinfo, entry in entries:
        parts.append('=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title))
        parts.append('   %s\n' % entry.title)
        parts.append('%s\n' % get_entry_text(entry)[0:100])
        parts.append('%s\n\n' % entry.link)
    body = ''.join(parts)
    count = len(entries)
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        # mark entries as sent only after send_mail succeeded
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print('no unmailed digest-entries found... not sending mail.')
|
2010-10-30 11:16:37 +02:00
|
|
|
|
2013-03-19 19:32:10 +01:00
|
|
|
def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    """Mail one single entry to the receiver and mark it as sent."""
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    # entry text first, then the feed's home link and the entry link
    lines = [
        '%s\n\n' % get_entry_text(entry),
        '%s\n' % feedinfo.link,
        '%s\n' % entry.link,
    ]
    send_mail(sender, receiver, subject, ''.join(lines))
    # remember that this entry went out so it is never mailed twice
    entry.sent = datetime.now()
|
2010-10-30 11:16:37 +02:00
|
|
|
|
2013-03-19 19:32:10 +01:00
|
|
|
def mail_single_entries(session, sender, receiver, prefix):
    """Mail every unsent entry of enabled non-daily feeds individually."""
    print('mailing single entries...')
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 0).\
        filter(Entry.sent == None).\
        all()
    count = 0
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count += 1
    if count > 0:
        print('sent %d mails' % count)
    else:
        print('no unmailed single entries found... not sending mail.')
|
2010-10-30 11:16:37 +02:00
|
|
|
|
2010-10-30 00:21:24 +02:00
|
|
|
def fetch_readability(link):
    """Fetch the readability-extracted article text for a link via hn.

    The hn helper returns utf-8 bytes; decode them before returning.
    """
    return hn.upgradeLink(link).decode('utf8')
|
|
|
|
|
2010-10-29 09:09:25 +02:00
|
|
|
def fetch_full_page(link):
    """Download a page and return it as latin-1 encoded plain text."""
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    # assumes every page is utf-8 encoded -- TODO confirm; other encodings
    # would raise UnicodeDecodeError here
    markup = response.read().decode('utf8')
    plain = html2text.html2text(markup)
    return plain.encode('latin-1', 'replace')
|
2010-10-29 09:09:25 +02:00
|
|
|
|
2013-03-18 19:28:47 +01:00
|
|
|
def process_feed_entry(session, feed, entry):
    """Store one parsed entry for a feed; return 1 if it was new, else 0.

    An entry is considered already known when feed id, title and link all
    match an existing row; known entries only get their fields refreshed.
    """
    #query = session.query(Entry).filter_by(feed_id=feed.id, title=entry.title.encode('latin-1', 'replace'))
    title = entry.title.encode('latin-1', 'replace')
    link = entry.link.encode('latin-1', 'replace')
    query = session.query(Entry).filter(Entry.feed_id==feed.id).filter(Entry.title==title).filter(Entry.link==link)
    try:
        thisentry = query.one()
    except NoResultFound:
        # BUG FIX: this previously caught *every* exception (even DB errors
        # or failures inside thisentry.update), silently treating the entry
        # as new and risking duplicates; only "no such row" means new.
        print('  new entry <%s>' % entry.title)
        thisentry = Entry(entry)
        if feed.fullpage:
            print('  fetching full page <%s>' % entry.link)
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print('  fetching readability <%s>' % entry.link)
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textsummary:
            print('  converting summary')
            # summaries are stored latin-1; html2text wants unicode
            summary = thisentry.summary.decode('latin-1')
            thisentry.summary = html2text.html2text(summary).encode('latin-1', 'replace')
        feed.entry.append(thisentry)
        return 1
    else:
        # already known: refresh the stored fields only
        thisentry.update(entry)
        print('  entry already known <%s>' % entry.title)
        return 0
|
2010-10-29 00:28:25 +02:00
|
|
|
|
2013-03-18 19:28:47 +01:00
|
|
|
def fetch_single_feed(session, feed):
    # Fetch one feed if it is due (per Feed.frequency) and store its entries.
    print 'processing %s' % feed.url
    query = session.query(Feedinfo).filter(Feedinfo.feed_id==feed.id)
    fetched = False
    try:
        feed.feedinfo = query.one()
        # honour the per-feed fetch frequency (minutes since last attempt)
        nextfetch = (feed.feedinfo.lastfetched + timedelta(minutes=feed.frequency))
        if datetime.now() > nextfetch:
            print 'fetching...'
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo.update(parser)
        else:
            print 'not fetching before: %s' % nextfetch
    except Exception, e:
        # NOTE(review): any error above lands here and is treated as "feed is
        # new" -- e.g. a NULL lastfetched or frequency would silently replace
        # the existing feedinfo; consider narrowing this to NoResultFound.
        print 'this feed seems to be new'
        print 'fetching...'
        parser = feedparser.parse(feed.url)
        fetched = True
        feed.feedinfo = Feedinfo(parser)

    if fetched:
        # walk all entries of the freshly parsed feed and count the new ones
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)
|
2010-10-29 00:28:25 +02:00
|
|
|
|
2013-03-18 19:28:47 +01:00
|
|
|
def fetch_all_feeds(session):
    """Fetch every enabled feed, in feed-id order."""
    print('fetching all feeds...')
    enabled_feeds = session.query(Feed).filter_by(enabled=1).order_by(Feed.id)
    for feed in enabled_feeds:
        fetch_single_feed(session, feed)
        # blank line between feeds keeps the console log readable
        print('')
|
|
|
|
|
2010-10-30 11:39:19 +02:00
|
|
|
if __name__ == '__main__':
    # all runtime settings (database credentials, mail addresses) come from
    # an ini-style config file next to the script
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')

    # assemble the SQLAlchemy connect string, e.g. mysql://user:pw@host/db
    dbconnectstring = '%s://%s:%s@%s/%s' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    # creates missing tables; existing tables are left untouched
    Base.metadata.create_all(engine)

    Session = sessionmaker(bind=engine)
    session = Session()

    # examples for seeding the feed table by hand:
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
    (options, args) = parser.parse_args()

    if options.fetch:
        fetch_all_feeds(session)
    if options.single:
        sender = config.get('email', 'sender')
        receiver = config.get('email', 'receiver')
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if options.daily:
        sender = config.get('email', 'sender')
        receiver = config.get('email', 'receiver')
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    # no action requested: show the usage text
    if not (options.fetch or options.single or options.daily):
        parser.print_help()

    # flush any pending 'sent' timestamps and other changes
    session.commit()
|