# -*- coding: utf-8 -*-
"""
Post Statistics
========================

This plugin calculates various statistics about a post and stores them in an
``article.stats`` dictionary:

wc: how many words
read_minutes: how many minutes to read this article, based on 250 wpm
    (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
word_counts: frequency count of all the words in the article; can be used
    for tag/word clouds

The extracted plain text is additionally stored as ``article.raw_text``.

The package's ``__init__.py`` re-exports this module with
``from .post_stats import *``.
"""

import re
from collections import Counter

from bs4 import BeautifulSoup
from pelican import signals, contents

# Reading speed, in words per minute, used for the ``read_minutes`` estimate.
_WPM = 250

# A well-formed HTML character reference: named (&amp;), decimal (&#8212;)
# or hexadecimal (&#x2014;).  Deliberately strict so that a bare '&' in
# running text does not swallow everything up to the next ';'.
_ENTITY_RE = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')

# Punctuation removed from the extracted text before words are counted.
_DROP = u'.,?!@#$%^&*()_+-=\\|/[]{}`~:;\'"‘’—…“”'
_DROP_TABLE = dict((ord(char), None) for char in _DROP)


def calculate_stats(instance):
    """Compute word statistics for *instance* and attach them to it.

    Reads ``instance._content`` and treats it as HTML markup.  When it is
    ``None`` the instance is left untouched.  Otherwise two attributes are
    set on *instance*:

    ``stats``
        A dict with the keys ``word_counts`` (a ``Counter`` mapping each
        lower-cased word to its frequency), ``wc`` (total number of words)
        and ``read_minutes`` (``wc`` divided by 250 words per minute,
        rounded to the nearest whole minute, so very short texts yield 0).
    ``raw_text``
        The text extracted from the markup with punctuation removed.
    """
    content = instance._content
    if content is None:
        return

    # Turn non-breaking-space entities into real spaces first, so that the
    # words on either side are not fused when entities are removed below.
    content = content.replace('&nbsp;', ' ')
    content = _ENTITY_RE.sub('', content)

    # Extract the text while the markup is still intact.  Stripping
    # punctuation from the raw HTML would remove '/', '=', quotes and '!'
    # and thereby corrupt closing tags, attributes and comments.
    # The parser is named explicitly so results do not depend on which
    # optional parsers happen to be installed.
    raw_text = BeautifulSoup(content, 'html.parser').get_text()
    raw_text = raw_text.translate(_DROP_TABLE)

    # str.split() with no arguments splits on any run of whitespace.
    words = raw_text.lower().split()
    total = len(words)

    instance.stats = {
        'word_counts': Counter(words),
        'wc': total,
        # Adding half of _WPM before the floor division rounds to nearest.
        'read_minutes': (total + _WPM // 2) // _WPM,
    }
    instance.raw_text = raw_text


def register():
    """Connect ``calculate_stats`` to pelican's ``content_object_init`` signal."""
    signals.content_object_init.connect(calculate_stats)