refactored processing of entries, enabled processing of enclosures

This commit is contained in:
2013-04-08 21:02:48 +02:00
parent 5b9cc700f8
commit 2718e6502c
2 changed files with 84 additions and 76 deletions

View File

@ -10,11 +10,6 @@ from ddate import ddate
import feedparser
import sys
import codecs
#import urllib
import urllib2
#import hn
import html2text
import HTMLParser
import ConfigParser
from argparse import ArgumentParser
from email.header import Header
@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix):
else:
print 'no unmailed single entries found... not sending mail.'
def fetch_readability(link):
    """Return the result of ``hn.upgradeLink(link)`` decoded from UTF-8.

    NOTE(review): the module-level ``import hn`` appears to be commented
    out in this file -- confirm ``hn`` is in scope before relying on this.
    """
    return hn.upgradeLink(link).decode('utf8')
def fetch_full_page(link):
    """Download *link* and return its content converted by html2text.

    The response body is decoded as UTF-8, passed through
    ``html2text.html2text`` and returned encoded as latin-1; characters
    that cannot be represented in latin-1 are replaced.

    Raises UnicodeDecodeError if the response body is not valid UTF-8.
    """
    # Imported locally because the module-level ``import urllib`` is
    # commented out, which would make the opener lookup a NameError.
    import urllib
    from contextlib import closing

    opener = urllib.FancyURLopener({})
    # closing() releases the connection even if read() raises.
    with closing(opener.open(link)) as response:
        html = response.read()
    text = html2text.html2text(html.decode('utf8'))
    return text.encode('latin-1', 'replace')
def process_feed_entry(session, feed, entry):
thisentry = session.query(Entry).\
filter(Entry.title == entry.title).\
@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry):
return 0
else:
print ' new entry <%s>' % entry.title
thisentry = Entry(entry)
if feed.resolveredirects:
print ' fetching final link <%s>' % entry.link
request = urllib2.Request(entry.link)
opener = urllib2.build_opener()
result = opener.open(request)
thisentry.resolvedlink = result.url
print ' final link: <%s>' % result.url
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
thisentry.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
thisentry.readability = fetch_readability(entry.link)
if feed.html2textcontent:
print ' converting summary'
h2t = html2text.HTML2Text()
h2t.body_width = 0
h2t.inline_links = False
if feed.html2textignoreimages:
h2t.ignore_images = True
if feed.contentcolumn == 'summary':
thisentry.summary = h2t.handle(thisentry.summary)
elif feed.contentcolumn == 'content':
thisentry.content = h2t.handle(thisentry.content)
elif feed.contentcolumn == 'fullpage':
thisentry.fullpage = h2t.handle(thisentry.fullpage)
elif feed.contentcolumn == 'readability':
thisentry.readability = h2t.handle(thisentry.readability)
hp = HTMLParser.HTMLParser()
if thisentry.summary:
thisentry.summary = hp.unescape(thisentry.summary)
if thisentry.content:
thisentry.content = hp.unescape(thisentry.content)
if thisentry.fullpage:
thisentry.fullpage = hp.unescape(thisentry.fullpage)
if thisentry.readability:
thisentry.readability = hp.unescape(thisentry.readability)
feed.entries.append(thisentry)
feed.entries.append(Entry(entry, feed))
session.commit()
return 1