From e4fb328d483ffc88db2a37314d8f745ec08005a2 Mon Sep 17 00:00:00 2001 From: Ronald Schaten Date: Sat, 30 Oct 2010 00:21:24 +0200 Subject: [PATCH] implemented readability-support --- atomstrom.py | 9 ++ hn.py | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 hn.py diff --git a/atomstrom.py b/atomstrom.py index e971329..365b56f 100755 --- a/atomstrom.py +++ b/atomstrom.py @@ -8,6 +8,7 @@ import feedparser import re import sys import urllib +import hn Base = declarative_base() @@ -91,6 +92,7 @@ class Entry(Base): enclosures = Column(Text) fullpage = Column(Text) + readability = Column(Text) lastfetched = Column(DateTime) sent = Column(DateTime) @@ -125,6 +127,11 @@ session = Session() #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1)) #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1)) +def fetch_readability(link): + text = hn.upgradeLink(link) + text = text.decode('utf8') + return text + def fetch_full_page(link): opener = urllib.FancyURLopener({}) response = opener.open(link) @@ -140,6 +147,8 @@ def process_feed_entry(feed, entry): thisentry = Entry(entry) if feed.fullpage == 1: thisentry.fullpage = fetch_full_page(entry.link) + if feed.readability == 1: + thisentry.readability = fetch_readability(entry.link) feed.entry.append(thisentry) return "+" diff --git a/hn.py b/hn.py new file mode 100644 index 0000000..71dd4c2 --- /dev/null +++ b/hn.py @@ -0,0 +1,231 @@ +""" +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + + +from xml.sax.saxutils import escape + +import urllib, re, os, urlparse +import HTMLParser, feedparser +from BeautifulSoup import BeautifulSoup +from pprint import pprint + +import codecs +import sys +streamWriter = codecs.lookup('utf-8')[-1] +sys.stdout = streamWriter(sys.stdout) + + +HN_RSS_FEED = "http://news.ycombinator.com/rss" + +NEGATIVE = re.compile("comment|meta|footer|footnote|foot") +POSITIVE = re.compile("post|hentry|entry|content|text|body|article") +PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""") + + +def grabContent(link, html): + + replaceBrs = re.compile("
<br */? *>[ \r\n]*<br */? *>")
+    html = re.sub(replaceBrs, "</p><p>
", html) + + try: + soup = BeautifulSoup(html) + except HTMLParser.HTMLParseError: + return "" + + # REMOVE SCRIPTS + for s in soup.findAll("script"): + s.extract() + + allParagraphs = soup.findAll("p") + topParent = None + + parents = [] + for paragraph in allParagraphs: + + parent = paragraph.parent + + if (parent not in parents): + parents.append(parent) + parent.score = 0 + + if (parent.has_key("class")): + if (NEGATIVE.match(parent["class"])): + parent.score -= 50 + if (POSITIVE.match(parent["class"])): + parent.score += 25 + + if (parent.has_key("id")): + if (NEGATIVE.match(parent["id"])): + parent.score -= 50 + if (POSITIVE.match(parent["id"])): + parent.score += 25 + + if (parent.score == None): + parent.score = 0 + + innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True)) + if (len(innerText) > 10): + parent.score += 1 + + parent.score += innerText.count(",") + + for parent in parents: + if ((not topParent) or (parent.score > topParent.score)): + topParent = parent + + if (not topParent): + return "" + + # REMOVE LINK'D STYLES + styleLinks = soup.findAll("link", attrs={"type" : "text/css"}) + for s in styleLinks: + s.extract() + + # REMOVE ON PAGE STYLES + for s in soup.findAll("style"): + s.extract() + + # CLEAN STYLES FROM ELEMENTS IN TOP PARENT + for ele in topParent.findAll(True): + del(ele['style']) + del(ele['class']) + + killDivs(topParent) + clean(topParent, "form") + clean(topParent, "object") + clean(topParent, "iframe") + + fixLinks(topParent, link) + + return topParent.renderContents() + + +def fixLinks(parent, link): + tags = parent.findAll(True) + + for t in tags: + if (t.has_key("href")): + t["href"] = urlparse.urljoin(link, t["href"]) + if (t.has_key("src")): + t["src"] = urlparse.urljoin(link, t["src"]) + + +def clean(top, tag, minWords=10000): + tags = top.findAll(tag) + + for t in tags: + if (t.renderContents().count(" ") < minWords): + t.extract() + + +def killDivs(parent): + + divs = parent.findAll("div") + for d 
in divs: + p = len(d.findAll("p")) + img = len(d.findAll("img")) + li = len(d.findAll("li")) + a = len(d.findAll("a")) + embed = len(d.findAll("embed")) + pre = len(d.findAll("pre")) + code = len(d.findAll("code")) + + if (d.renderContents().count(",") < 10): + if ((pre == 0) and (code == 0)): + if ((img > p ) or (li > p) or (a > p) or (p == 0) or (embed > 0)): + d.extract() + + +def upgradeLink(link): + + link = link.encode('utf-8') + + if (not (link.startswith("http://news.ycombinator.com") or link.endswith(".pdf"))): + linkFile = "upgraded/" + re.sub(PUNCTUATION, "_", link) + if (os.path.exists(linkFile)): + return open(linkFile).read() + else: + content = "" + try: + html = urllib.urlopen(link).read() + content = grabContent(link, html) + filp = open(linkFile, "w") + filp.write(content) + filp.close() + except IOError: + pass + return content + else: + return "" + + + +def upgradeFeed(feedUrl): + + feedData = urllib.urlopen(feedUrl).read() + + upgradedLinks = [] + parsedFeed = feedparser.parse(feedData) + + for entry in parsedFeed.entries: + upgradedLinks.append((entry, upgradeLink(entry.link))) + + rss = """ + + Hacker News + http://news.ycombinator.com/ + Links for the intellectually curious, ranked by readers. + + """ + + for entry, content in upgradedLinks: + rss += u""" + + %s + %s + %s + + Comments
%s<br/><a href="%s">Comments</a>]]>
+  </description>
+ </item>
+""" % (entry.title, escape(entry.link), escape(entry.comments), entry.comments, content.decode('utf-8'), entry.comments)
+
+    rss += """
+ </channel>
+</rss>"""
+
+    return rss
+
+if __name__ == "__main__":
+    print upgradeFeed(HN_RSS_FEED)