Initial commit of Post Statistics plugin

2013-06-23 02:29:08 -07:00
parent 6cb0c1a925
commit d38b4a8028
2 changed files with 75 additions and 0 deletions
--- a/post_stats/__init__.py
+++ b/post_stats/__init__.py
@@ -0,0 +1 @@
 from .post_stats import *
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -0,0 +1,74 @@
 # -*- coding: utf-8 -*-
 """
 Post Statistics
 ========================
 This plugin calculates various statistics about a post and stores them in an article.stats dictionary.
 wc: how many words
 read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
 word_counts: frequency count of all the words in the article; can be used for tag/word clouds.
 """
 from pelican import signals, contents
 # import nltk
 from bs4 import BeautifulSoup
 # import lxml.html
 # from lxml.html.clean import Cleaner
 import re
 from collections import Counter
 def calculate_stats(instance):
     """Compute word statistics for a content object and attach them to it.

     When ``instance._content`` is not None, the HTML is reduced to plain
     text and two attributes are set on ``instance``:

     * ``stats``: a dict with ``word_counts`` (Counter of lower-cased
       words), ``wc`` (total number of words) and ``read_minutes``
       (``wc`` at 250 words per minute, rounded to the nearest minute,
       so texts shorter than 125 words yield 0).
     * ``raw_text``: the tag-free, punctuation-free text that was counted.

     Nothing is set when ``instance._content`` is None.
     """
     WPM = 250
     if instance._content is None:
         return
     content = instance._content.replace('&nbsp;', ' ')
     # Drop character entities such as &amp;, &#8217; or &#x27;.  The entity
     # name is restricted to word characters so that a bare "&" in running
     # text cannot swallow everything up to the next ";" on the same line.
     content = re.sub(r'&#?\w+;', '', content)
     # Extract the visible text BEFORE removing punctuation: stripping
     # characters like "/", "=", "!" and quotes from the raw HTML would
     # corrupt closing tags, attributes and comments and leak markup
     # fragments into the word list.  The parser is named explicitly so the
     # result does not depend on which optional parsers are installed.
     raw_text = BeautifulSoup(content, 'html.parser').get_text()
     # Punctuation to delete from the extracted text ("\\|" is a literal
     # backslash followed by a pipe).
     drop = u'.,?!@#$%^&*()_+-=\\|/[]{}`~:;\'"‘’—…“”'
     raw_text = raw_text.translate(dict((ord(c), None) for c in drop))
     word_count = Counter(raw_text.lower().split())
     total_words = sum(word_count.values())
     instance.stats = {
         'word_counts': word_count,
         'wc': total_words,
         # Adding half of WPM before the floor division rounds to nearest.
         'read_minutes': (total_words + WPM // 2) // WPM,
     }
     instance.raw_text = raw_text
 def register():
     """Connect calculate_stats to the content_object_init signal of pelican.signals."""
     init_signal = signals.content_object_init
     init_signal.connect(calculate_stats)