implemented readability support
@@ -8,6 +8,7 @@ import feedparser
 import re
 import sys
 import urllib
+import hn
 
 Base = declarative_base()
 
@@ -91,6 +92,7 @@ class Entry(Base):
     enclosures = Column(Text)
 
     fullpage = Column(Text)
+    readability = Column(Text)
     lastfetched = Column(DateTime)
     sent = Column(DateTime)
 
@@ -125,6 +127,11 @@ session = Session()
 #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1))
 #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1))
 
+def fetch_readability(link):
+    text = hn.upgradeLink(link)
+    text = text.decode('utf8')
+    return text
+
 def fetch_full_page(link):
     opener = urllib.FancyURLopener({})
     response = opener.open(link)
@@ -140,6 +147,8 @@ def process_feed_entry(feed, entry):
         thisentry = Entry(entry)
         if feed.fullpage == 1:
             thisentry.fullpage = fetch_full_page(entry.link)
+        if feed.readability == 1:
+            thisentry.readability = fetch_readability(entry.link)
         feed.entry.append(thisentry)
         return "+"
 
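Note: the new fetch_readability() helper simply wraps hn.upgradeLink(), which caches every extracted page on disk. A minimal usage sketch (assumptions: Python 2 with BeautifulSoup 3 and feedparser installed, hn.py on the import path; hn.py does not create its "upgraded/" cache directory itself, and its "except IOError" silently skips caching when the directory is missing):

    import os
    import hn

    # upgradeLink() caches extracted article HTML under "upgraded/" in the
    # working directory; creating it up front avoids the silent IOError path
    # in which nothing gets cached.
    if not os.path.isdir('upgraded'):
        os.mkdir('upgraded')

    # upgradeLink() returns UTF-8 bytes ("" for *.pdf links and links into
    # news.ycombinator.com); decode before storing in the Text column, as
    # fetch_readability() does. The URL here is illustrative.
    print hn.upgradeLink('http://example.com/article').decode('utf8')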
 
hn.py (new file, 231 lines)
@@ -0,0 +1,231 @@
+"""
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+from xml.sax.saxutils import escape
+
+import urllib, re, os, urlparse
+import HTMLParser, feedparser
+from BeautifulSoup import BeautifulSoup
+from pprint import pprint
+
+import codecs
+import sys
+streamWriter = codecs.lookup('utf-8')[-1]
+sys.stdout = streamWriter(sys.stdout)
+
+
+HN_RSS_FEED = "http://news.ycombinator.com/rss"
+
+NEGATIVE    = re.compile("comment|meta|footer|footnote|foot")
+POSITIVE    = re.compile("post|hentry|entry|content|text|body|article")
+PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")
+
+
+def grabContent(link, html):
+
+    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
+    html = re.sub(replaceBrs, "</p><p>", html)
+
+    try:
+        soup = BeautifulSoup(html)
+    except HTMLParser.HTMLParseError:
+        return ""
+
+    # REMOVE SCRIPTS
+    for s in soup.findAll("script"):
+        s.extract()
+
+    allParagraphs = soup.findAll("p")
+    topParent     = None
+
+    parents = []
+    for paragraph in allParagraphs:
+
+        parent = paragraph.parent
+
+        if (parent not in parents):
+            parents.append(parent)
+            parent.score = 0
+
+            if (parent.has_key("class")):
+                if (NEGATIVE.match(parent["class"])):
+                    parent.score -= 50
+                if (POSITIVE.match(parent["class"])):
+                    parent.score += 25
+
+            if (parent.has_key("id")):
+                if (NEGATIVE.match(parent["id"])):
+                    parent.score -= 50
+                if (POSITIVE.match(parent["id"])):
+                    parent.score += 25
+
+        if (parent.score == None):
+            parent.score = 0
+
+        innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True))
+        if (len(innerText) > 10):
+            parent.score += 1
+
+        parent.score += innerText.count(",")
+
+    for parent in parents:
+        if ((not topParent) or (parent.score > topParent.score)):
+            topParent = parent
+
+    if (not topParent):
+        return ""
+
+    # REMOVE LINK'D STYLES
+    styleLinks = soup.findAll("link", attrs={"type" : "text/css"})
+    for s in styleLinks:
+        s.extract()
+
+    # REMOVE ON PAGE STYLES
+    for s in soup.findAll("style"):
+        s.extract()
+
+    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
+    for ele in topParent.findAll(True):
+        del(ele['style'])
+        del(ele['class'])
+
+    killDivs(topParent)
+    clean(topParent, "form")
+    clean(topParent, "object")
+    clean(topParent, "iframe")
+
+    fixLinks(topParent, link)
+
+    return topParent.renderContents()
+
+
+def fixLinks(parent, link):
+    tags = parent.findAll(True)
+
+    for t in tags:
+        if (t.has_key("href")):
+            t["href"] = urlparse.urljoin(link, t["href"])
+        if (t.has_key("src")):
+            t["src"] = urlparse.urljoin(link, t["src"])
+
+
+def clean(top, tag, minWords=10000):
+    tags = top.findAll(tag)
+
+    for t in tags:
+        if (t.renderContents().count(" ") < minWords):
+            t.extract()
+
+
+def killDivs(parent):
+
+    divs = parent.findAll("div")
+    for d in divs:
+        p     = len(d.findAll("p"))
+        img   = len(d.findAll("img"))
+        li    = len(d.findAll("li"))
+        a     = len(d.findAll("a"))
+        embed = len(d.findAll("embed"))
+        pre   = len(d.findAll("pre"))
+        code  = len(d.findAll("code"))
+
+        if (d.renderContents().count(",") < 10):
+            if ((pre == 0) and (code == 0)):
+                if ((img > p ) or (li > p) or (a > p) or (p == 0) or (embed > 0)):
+                    d.extract()
+
+
+def upgradeLink(link):
+
+    link = link.encode('utf-8')
+
+    if (not (link.startswith("http://news.ycombinator.com") or link.endswith(".pdf"))):
+        linkFile = "upgraded/" + re.sub(PUNCTUATION, "_", link)
+        if (os.path.exists(linkFile)):
+            return open(linkFile).read()
+        else:
+            content = ""
+            try:
+                html = urllib.urlopen(link).read()
+                content = grabContent(link, html)
+                filp = open(linkFile, "w")
+                filp.write(content)
+                filp.close()
+            except IOError:
+                pass
+            return content
+    else:
+        return ""
+
+
+def upgradeFeed(feedUrl):
+
+    feedData = urllib.urlopen(feedUrl).read()
+
+    upgradedLinks = []
+    parsedFeed = feedparser.parse(feedData)
+
+    for entry in parsedFeed.entries:
+        upgradedLinks.append((entry, upgradeLink(entry.link)))
+
+    rss = """<rss version="2.0">
+<channel>
+	<title>Hacker News</title>
+	<link>http://news.ycombinator.com/</link>
+	<description>Links for the intellectually curious, ranked by readers.</description>
+
+    """
+
+    for entry, content in upgradedLinks:
+        rss += u"""
+    <item>
+        <title>%s</title>
+        <link>%s</link>
+        <comments>%s</comments>
+        <description>
+            <![CDATA[<a href="%s">Comments</a><br/>%s<br/><a href="%s">Comments</a>]]>
+        </description>
+    </item>
+""" % (entry.title, escape(entry.link), escape(entry.comments), entry.comments, content.decode('utf-8'), entry.comments)
+
+    rss += """
+</channel>
+</rss>"""
+
+    return rss
+
+if __name__ == "__main__":
+    print upgradeFeed(HN_RSS_FEED)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
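Run directly, hn.py prints an upgraded copy of the HN front-page feed to stdout (python hn.py > hn-full.rss); this commit instead imports it and drives it per-link. A sketch of using the feed-level entry point from code (the output filename is illustrative):

    import hn

    # upgradeFeed() fetches the feed, runs every entry through upgradeLink()
    # and returns the rewritten RSS as a (unicode) string once items with
    # extracted bodies have been appended.
    rss = hn.upgradeFeed(hn.HN_RSS_FEED)

    out = open('hn-full.rss', 'w')
    out.write(rss.encode('utf-8'))
    out.close()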