implemented newer version of readability-module
This commit is contained in:
21
readability/encoding.py
Normal file
21
readability/encoding.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import re
|
||||
import chardet
|
||||
|
||||
def get_encoding(page):
|
||||
text = re.sub('</?[^>]*>\s*', ' ', page)
|
||||
enc = 'utf-8'
|
||||
if not text.strip() or len(text) < 10:
|
||||
return enc # can't guess
|
||||
try:
|
||||
diff = text.decode(enc, 'ignore').encode(enc)
|
||||
sizes = len(diff), len(text)
|
||||
if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
|
||||
return enc
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
res = chardet.detect(text)
|
||||
enc = res['encoding']
|
||||
#print '->', enc, "%.2f" % res['confidence']
|
||||
if enc == 'MacCyrillic':
|
||||
enc = 'cp1251'
|
||||
return enc
|
||||
Reference in New Issue
Block a user