diff --git a/atomstrom.py b/atomstrom.py
index e971329..365b56f 100755
--- a/atomstrom.py
+++ b/atomstrom.py
@@ -8,6 +8,7 @@ import feedparser
import re
import sys
import urllib
+import hn
Base = declarative_base()
@@ -91,6 +92,7 @@ class Entry(Base):
enclosures = Column(Text)
fullpage = Column(Text)
+ readability = Column(Text)
lastfetched = Column(DateTime)
sent = Column(DateTime)
@@ -125,6 +127,11 @@ session = Session()
#session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1))
#session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1))
+def fetch_readability(link):
+    # Run the bundled hn.py readability extractor on the entry's link and
+    # return the cleaned article body for storage in Entry.readability.
+    text = hn.upgradeLink(link)
+    # upgradeLink returns a UTF-8 encoded byte string (it encodes the link
+    # and writes/reads a disk cache); decode so the DB column gets unicode.
+    text = text.decode('utf8')
+    return text
+
def fetch_full_page(link):
opener = urllib.FancyURLopener({})
response = opener.open(link)
@@ -140,6 +147,8 @@ def process_feed_entry(feed, entry):
thisentry = Entry(entry)
if feed.fullpage == 1:
thisentry.fullpage = fetch_full_page(entry.link)
+ if feed.readability == 1:
+ thisentry.readability = fetch_readability(entry.link)
feed.entry.append(thisentry)
return "+"
diff --git a/hn.py b/hn.py
new file mode 100644
index 0000000..71dd4c2
--- /dev/null
+++ b/hn.py
@@ -0,0 +1,231 @@
+"""
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
[ \r\n]*
")
+ html = re.sub(replaceBrs, "
", html)
+
+ try:
+ soup = BeautifulSoup(html)
+ except HTMLParser.HTMLParseError:
+ return ""
+
+ # REMOVE SCRIPTS
+ for s in soup.findAll("script"):
+ s.extract()
+
+ allParagraphs = soup.findAll("p")
+ topParent = None
+
+ parents = []
+ for paragraph in allParagraphs:
+
+ parent = paragraph.parent
+
+ if (parent not in parents):
+ parents.append(parent)
+ parent.score = 0
+
+ if (parent.has_key("class")):
+ if (NEGATIVE.match(parent["class"])):
+ parent.score -= 50
+ if (POSITIVE.match(parent["class"])):
+ parent.score += 25
+
+ if (parent.has_key("id")):
+ if (NEGATIVE.match(parent["id"])):
+ parent.score -= 50
+ if (POSITIVE.match(parent["id"])):
+ parent.score += 25
+
+ if (parent.score == None):
+ parent.score = 0
+
+ innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True))
+ if (len(innerText) > 10):
+ parent.score += 1
+
+ parent.score += innerText.count(",")
+
+ for parent in parents:
+ if ((not topParent) or (parent.score > topParent.score)):
+ topParent = parent
+
+ if (not topParent):
+ return ""
+
+ # REMOVE LINK'D STYLES
+ styleLinks = soup.findAll("link", attrs={"type" : "text/css"})
+ for s in styleLinks:
+ s.extract()
+
+ # REMOVE ON PAGE STYLES
+ for s in soup.findAll("style"):
+ s.extract()
+
+ # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
+ for ele in topParent.findAll(True):
+ del(ele['style'])
+ del(ele['class'])
+
+ killDivs(topParent)
+ clean(topParent, "form")
+ clean(topParent, "object")
+ clean(topParent, "iframe")
+
+ fixLinks(topParent, link)
+
+ return topParent.renderContents()
+
+
+def fixLinks(parent, link):
+    # Rewrite every href/src attribute under *parent* to an absolute URL,
+    # using *link* (the page the fragment came from) as the base, so the
+    # extracted content still resolves images/links outside its origin page.
+    tags = parent.findAll(True)
+
+    for t in tags:
+        if (t.has_key("href")):
+            t["href"] = urlparse.urljoin(link, t["href"])
+        if (t.has_key("src")):
+            t["src"] = urlparse.urljoin(link, t["src"])
+
+
+def clean(top, tag, minWords=10000):
+    # Remove every *tag* element below *top* whose rendered contents hold
+    # fewer than *minWords* spaces (a cheap proxy for a word count).  With
+    # the huge default, callers (form/object/iframe) effectively remove the
+    # elements unconditionally.
+    tags = top.findAll(tag)
+
+    for t in tags:
+        if (t.renderContents().count(" ") < minWords):
+            t.extract()
+
+
+def killDivs(parent):
+    # Readability-style pruning: drop <div>s that look like boilerplate
+    # (navigation, link lists, image galleries) instead of article prose.
+
+    divs = parent.findAll("div")
+    for d in divs:
+        # Tally the kinds of children that signal boilerplate vs. content.
+        p = len(d.findAll("p"))
+        img = len(d.findAll("img"))
+        li = len(d.findAll("li"))
+        a = len(d.findAll("a"))
+        embed = len(d.findAll("embed"))
+        pre = len(d.findAll("pre"))
+        code = len(d.findAll("code"))
+
+        # Comma-rich text reads like prose: keep it.  Also keep anything
+        # containing code samples (<pre>/<code>).  Otherwise remove divs
+        # where images/list items/links outnumber paragraphs, there are no
+        # paragraphs at all, or an <embed> is present.
+        if (d.renderContents().count(",") < 10):
+            if ((pre == 0) and (code == 0)):
+                if ((img > p ) or (li > p) or (a > p) or (p == 0) or (embed > 0)):
+                    d.extract()
+
+
+def upgradeLink(link):
+    # Return readability-extracted HTML for *link*, caching results on disk
+    # under upgraded/.  Hacker News discussion pages and PDFs are skipped
+    # and yield "".  Network/file errors are swallowed (best effort) and
+    # also yield "".
+
+    link = link.encode('utf-8')
+
+    if (not (link.startswith("http://news.ycombinator.com") or link.endswith(".pdf"))):
+        # Cache file name: the URL with punctuation squashed to "_".
+        # NOTE(review): assumes the upgraded/ directory already exists.
+        linkFile = "upgraded/" + re.sub(PUNCTUATION, "_", link)
+        if (os.path.exists(linkFile)):
+            return open(linkFile).read()
+        else:
+            content = ""
+            try:
+                html = urllib.urlopen(link).read()
+                content = grabContent(link, html)
+                filp = open(linkFile, "w")
+                filp.write(content)
+                filp.close()
+            except IOError:
+                pass
+            return content
+    else:
+        return ""
+
+
+
+def upgradeFeed(feedUrl):
+
+ feedData = urllib.urlopen(feedUrl).read()
+
+ upgradedLinks = []
+ parsedFeed = feedparser.parse(feedData)
+
+ for entry in parsedFeed.entries:
+ upgradedLinks.append((entry, upgradeLink(entry.link)))
+
+ rss = """
%s
Comments]]>
+