#!/usr/bin/env python
# coding: utf-8

from models import Base, Feed, Feedinfo, Entry
from sqlalchemy import create_engine, desc, func, or_
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from ddate import ddate
import feedparser
import sys
import codecs
#import urllib
import urllib2
#import hn
import html2text
import ConfigParser
from argparse import ArgumentParser
from cStringIO import StringIO
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header
from email import Charset
from email.generator import Generator
import smtplib


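# Build a UTF-8 (quoted-printable) multipart mail with encoded headers and
# hand it to the local SMTP server.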
def send_mail(sender, receiver, subject, body):
    print 'sending to %s: %s' % (receiver[0], subject)
    Charset.add_charset('utf-8', Charset.QP, Charset.QP, 'utf-8')
    mail = MIMEMultipart('alternative')
    mail['Subject'] = "%s" % Header(subject, 'utf-8')
    mail['From'] = "\"%s\" <%s>" % (Header(sender[0], 'utf-8'), sender[1])
    mail['To'] = "\"%s\" <%s>" % (Header(receiver[0], 'utf-8'), receiver[1])
    textpart = MIMEText(body, 'plain', 'utf-8')
    mail.attach(textpart)
    str_io = StringIO()
    gen = Generator(str_io, False)
    gen.flatten(mail)
    s = smtplib.SMTP('localhost')
    s.sendmail("", receiver[1], str_io.getvalue())
    # close the connection to the local MTA
    s.quit()


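# Collapse whitespace and truncate at a word boundary, e.g.
# truncate_text('one two three four', 10) returns 'one two...'.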
def truncate_text(content, length=100, suffix='...'):
    content = " ".join(content.split())
    if len(content) <= length:
        return content
    else:
        return content[:length].rsplit(' ', 1)[0]+suffix


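# Collect all unsent entries of enabled feeds flagged 'daily', format them
# into a single digest mail and mark them as sent.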
def mail_daily_digest(session, sender, receiver, prefix):
    print 'mailing daily digest...'
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        try:
            body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
            body = body + '>> %s\n' % entry.title
            body = body + '%s\n' % truncate_text(entry.get_text(), 250)
            body = body + '%s\n\n' % link
        except:
            print 'ERROR processing entry %s' % entry.id
            print sys.exc_info()
            print 'not sending mail'
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        body = '%s\n\n%s\n\n%s' % (subject, ddate(), body)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print 'no unmailed digest-entries found... not sending mail.'


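# Mail one entry as an individual message; the feed title is used as the
# sender name and the entry is marked as sent.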
def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = '%s\n\n' % entry.get_text()
    body = body + '%s\n' % feedinfo.link
    body = body + '%s\n' % link
    sender[0] = feedinfo.title
    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()


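# Mail every unsent entry of enabled feeds that are not flagged 'daily'
# (daily == 0 or NULL).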
def mail_single_entries(session, sender, receiver, prefix):
    print 'mailing single entries...'
    count = 0
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print 'sent %d mails' % count
    else:
        print 'no unmailed single entries found... not sending mail.'


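# Fetch a readability-style extract of the linked page. This needs the
# optional 'hn' helper module, whose import is commented out above, so feeds
# should only enable 'readability' when that module is available.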
def fetch_readability(link):
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text


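# Download the linked page and convert the HTML to plain text via html2text.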
def fetch_full_page(link):
    # 'import urllib' is commented out above, so fetch via urllib2 instead of
    # urllib.FancyURLopener
    response = urllib2.urlopen(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')


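# Store one parsed feed entry. Returns 0 if the entry is already known
# (matched by title and link), 1 for a new entry. New entries optionally get
# their redirect resolved, the full page fetched, a readability extract
# fetched, and the configured content column converted with html2text.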
def process_feed_entry(session, feed, entry):
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    if thisentry:
        print ' entry already known <%s>' % entry.title
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    else:
        print ' new entry <%s>' % entry.title
        thisentry = Entry(entry)
        if feed.resolveredirects:
            print ' fetching final link <%s>' % entry.link
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = opener.open(request)
            thisentry.resolvedlink = result.url
            print ' final link: <%s>' % result.url
        if feed.fullpage:
            print ' fetching full page <%s>' % entry.link
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print ' fetching readability <%s>' % entry.link
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            print ' converting summary'
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            if feed.contentcolumn == 'summary':
                thisentry.summary = h2t.handle(thisentry.summary)
            elif feed.contentcolumn == 'content':
                thisentry.content = h2t.handle(thisentry.content)
            elif feed.contentcolumn == 'fullpage':
                thisentry.fullpage = h2t.handle(thisentry.fullpage)
            elif feed.contentcolumn == 'readability':
                thisentry.readability = h2t.handle(thisentry.readability)
        feed.entry.append(thisentry)
        session.commit()
        return 1


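# Fetch and update a single feed, unless its feedinfo.nextfetch timestamp
# says it is too early; new feeds get a fresh Feedinfo record.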
def fetch_single_feed(session, feed):
    print 'processing %s' % feed.url
    thisfeedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id==feed.id).\
        first()
    fetched = False
    if thisfeedinfo:
        feed.feedinfo = thisfeedinfo
        if (not feed.feedinfo.nextfetch) or (feed.feedinfo.nextfetch < datetime.now()):
            print 'feed known, fetching...'
            try:
                parser = feedparser.parse(feed.url)
                fetched = True
                feed.feedinfo.update(parser)
            except:
                print 'ERROR parsing feed'
                print sys.exc_info()
        else:
            print 'not fetching before: %s' % feed.feedinfo.nextfetch
    else:
        print 'feed seems to be new, fetching...'
        try:
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo = Feedinfo(parser)
        except:
            print 'ERROR parsing feed'
            print sys.exc_info()

    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)


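# Print a summary line for every configured feed plus overall totals.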
def list_all_feeds(session):
    allfeeds = session.query(Feed).\
        order_by(Feed.id)
    totalfeeds = 0
    totalentries = 0
    for feed in allfeeds:
        print unicode(feed)
        totalfeeds += 1
        totalentries += len(feed.entry)
    print 'TOTAL: %d entries in %d feeds.' % (totalentries, totalfeeds)


def fetch_all_feeds(session):
    print 'fetching all feeds...'
    allfeeds = session.query(Feed).\
        filter_by(enabled=1).\
        order_by(Feed.id)
    for feed in allfeeds:
        fetch_single_feed(session, feed)
        print


def delete_feed(session, feed_id):
    print 'deleting feed %d...' % feed_id
    # TODO implement delete
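    # A possible implementation (sketch, not part of the original code;
    # assumes no cascading delete is configured on the relationships):
    #feed = session.query(Feed).filter(Feed.id == feed_id).first()
    #if feed:
    #    for entry in feed.entry:
    #        session.delete(entry)
    #    if feed.feedinfo:
    #        session.delete(feed.feedinfo)
    #    session.delete(feed)
    #    session.commit()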


def reset_feed(session, feed_id):
    print 'resetting feed %d...' % feed_id
    # TODO implement reset
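    # A possible implementation (sketch, not part of the original code):
    # drop the fetched entries and clear the fetch timestamp so the feed is
    # fetched from scratch on the next run.
    #feed = session.query(Feed).filter(Feed.id == feed_id).first()
    #if feed:
    #    for entry in feed.entry:
    #        session.delete(entry)
    #    if feed.feedinfo:
    #        feed.feedinfo.nextfetch = None
    #    session.commit()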
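

# Command-line entry point: database and mail settings come from
# atomstrom.conf; combine options as needed, for example (script name
# assumed) "python atomstrom.py --fetch --single --daily".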
if __name__ == '__main__':
    streamWriter = codecs.lookup('utf-8')[-1]
    sys.stdout = streamWriter(sys.stdout)

    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')

    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)

    Session = sessionmaker(bind=engine)
    session = Session()

    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = ArgumentParser(description='Fetch RSS- and Atom-feeds and send mails.')
    parser.add_argument('-f', '--fetch', action='store_true', help='fetch all feeds')
    parser.add_argument('-s', '--single', action='store_true', help='send single mails')
    parser.add_argument('-d', '--daily', action='store_true', help='send daily digest')
    parser.add_argument('-l', '--list', action='store_true', help='list all configured feeds')
    parser.add_argument('-e', '--delete', action='store', type=int, metavar='ID', help='delete feed <ID> from configuration')
    parser.add_argument('-r', '--reset', action='store', type=int, metavar='ID', help='reset data for feed <ID>')
    args = parser.parse_args()

    if args.fetch:
        fetch_all_feeds(session)
    if args.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if args.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if args.list:
        list_all_feeds(session)
    if args.delete:
        delete_feed(session, args.delete)
    if args.reset:
        reset_feed(session, args.reset)
    if not (args.fetch or args.single or args.daily or args.list or args.delete or args.reset):
        parser.print_help()

    session.commit()