Tidies up, removed commented out nltk & lxml alternative versions.

This commit is contained in:
Duncan Lock
2013-06-23 14:25:10 -07:00
parent 2dd13f3fa0
commit 546d003682

View File

@@ -3,76 +3,54 @@
Post Statistics
========================
This plugin calculates various statistics about a post and stores them in an article.stats dictionary.
This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
wc: how many words
read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
word_count: frequency count of all the words in the article; can be used for tag/word clouds.
read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
word_counts: frequency count of all the words in the article; can be used for tag/word clouds.
"""
from pelican import signals
# import math
# import nltk
from bs4 import BeautifulSoup
# import lxml.html
# from lxml.html.clean import Cleaner
import re
from collections import Counter
def calculate_stats(instance):
    """Compute word statistics for a content object and attach them to it.

    Reads ``instance._content`` (HTML markup). When it is ``None`` the
    instance is left untouched. Otherwise two attributes are set:

    ``instance.stats``
        A dict with the keys

        * ``word_counts`` -- ``Counter`` mapping each lower-cased word to
          the number of times it occurs,
        * ``wc`` -- total number of words,
        * ``read_mins`` and ``read_minutes`` -- estimated reading time in
          whole minutes at 250 words per minute, rounded up and never less
          than 1. Both keys always hold the same value.

    ``instance.raw_text``
        The visible text of the content with HTML entities and
        punctuation removed.
    """
    # How fast do average people read?
    WPM = 250

    if instance._content is None:
        return

    content = instance._content

    # Non-breaking spaces separate words, so turn them into plain spaces
    # instead of deleting them along with the other entities.
    content = content.replace(u'&nbsp;', u' ').replace(u'\xa0', u' ')

    # Remove the remaining named/numeric entities. The pattern only accepts
    # word characters between '&' and ';' so that a bare ampersand in running
    # text cannot swallow everything up to the next semicolon.
    content = re.sub(r'&#?\w+;', u'', content)

    # Use BeautifulSoup to get readable/visible text. The parser is named
    # explicitly so the result does not depend on which optional parsers
    # happen to be installed.
    raw_text = BeautifulSoup(content, 'html.parser').get_text()

    # Strip punctuation from the extracted text, not from the markup:
    # removing '!', '-', '/', '=' or quotes before parsing would corrupt
    # tags and turn HTML comments into visible text.
    drop = u'.,?!@#$%^&*()_+-=\\|/[]{}`~:;\'"‘’—…“”'
    raw_text = raw_text.translate(dict((ord(c), None) for c in drop))

    # Count the words in the text
    words = raw_text.lower().split()
    total_words = len(words)

    # Calculate how long it'll take to read, rounding up (minimum 1 minute)
    read_mins = max(1, (total_words + WPM - 1) // WPM)

    instance.stats = {
        'word_counts': Counter(words),
        'wc': total_words,
        'read_minutes': read_mins,
        'read_mins': read_mins,
    }
    instance.raw_text = raw_text
def register():