1
post_stats/__init__.py
Normal file
1
post_stats/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .post_stats import *
|
||||||
71
post_stats/post_stats.py
Normal file
71
post_stats/post_stats.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Post Statistics
|
||||||
|
========================
|
||||||
|
|
||||||
|
This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
|
||||||
|
|
||||||
|
wc: how many words
|
||||||
|
read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
|
||||||
|
word_counts: frequency count of all the words in the article; can be used for tag/word clouds
|
||||||
|
fi: Flesch-kincaid Index/ Reading Ease
|
||||||
|
fk: Flesch-kincaid Grade Level
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pelican import signals
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from .readability import *
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_stats(instance):
    """Compute per-article statistics and attach them as ``instance.stats``.

    Signal handler for Pelican's ``content_object_init``.  The stats dict
    contains:

    - ``wc``: total word count
    - ``word_counts``: ``collections.Counter`` of word frequencies
    - ``read_mins``: estimated reading time in whole minutes (250 wpm),
      never less than 1
    - ``fi``: Flesch reading-ease index, formatted to two decimals
    - ``fk``: Flesch-Kincaid grade level, formatted to two decimals

    :param instance: a Pelican content object; left untouched when its
        ``_content`` is ``None``.
    """
    if instance._content is None:
        return

    stats = {}
    content = instance._content

    # How fast do average people read?
    WPM = 250

    # Use BeautifulSoup to get readable/visible text.  Name the parser
    # explicitly: the original relied on the default, which emits a
    # warning and varies with whatever parser happens to be installed.
    raw_text = BeautifulSoup(content, 'html.parser').getText()

    # Process the text to remove entities.  Replace non-breaking spaces
    # first (the original line had degenerated into a no-op
    # ' ' -> ' ' replace), then strip remaining &...;/&#...; entities.
    entities = r'\&\#?.+?;'
    raw_text = raw_text.replace('&nbsp;', ' ')
    raw_text = re.sub(entities, '', raw_text)

    # Flesch-Kincaid readability stats count sentences, so keep a copy
    # before punctuation is removed.
    punctuated = raw_text

    # Process the text to remove punctuation (incl. typographic quotes,
    # dashes and ellipses).
    drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
    raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))

    # Count the words in the text.
    word_count = Counter(raw_text.lower().split())
    stats['word_counts'] = word_count
    stats['wc'] = sum(word_count.values())

    # Calculate how long it'll take to read, rounding up; a non-empty
    # article always reports at least one minute.
    stats['read_mins'] = max(1, (stats['wc'] + WPM - 1) // WPM)

    # Calculate Flesch-Kincaid readability stats.  (The original bound
    # the tuple to three extra, unused locals via chained assignment.)
    readability_stats = text_stats(punctuated, stats['wc'])
    stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
    stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))

    instance.stats = stats
|
||||||
|
|
||||||
|
|
||||||
|
def register():
    # Pelican plugin entry point: run calculate_stats whenever a content
    # object (article or page) is initialised.
    signals.content_object_init.connect(calculate_stats)
|
||||||
56
post_stats/readability.py
Normal file
56
post_stats/readability.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Adapted from here: http://acdx.net/calculating-the-flesch-kincaid-level-in-python/
|
||||||
|
# See here for details: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test
|
||||||
|
|
||||||
|
from __future__ import division
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def mean(seq):
    """Return the arithmetic mean of *seq* (a sized, non-empty sequence)."""
    total = sum(seq)
    return total / len(seq)
|
||||||
|
|
||||||
|
|
||||||
|
def syllables(word):
    """Estimate the syllable count of *word* with a crude heuristic.

    Words of three letters or fewer count as one syllable.  Otherwise a
    trailing "es"/"ed"/silent "e" (but not "le") is dropped and each run
    of vowels (including "y") counts as one syllable.
    """
    if len(word) <= 3:
        return 1

    stripped = re.sub(r"(es|ed|(?<!l)e)$", "", word)
    vowel_runs = re.findall(r"[aeiouy]+", stripped)
    return len(vowel_runs)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text):
    """Collapse *text* into '. '-separated sentences of plain words.

    Removes every character that is not a letter, whitespace, or a
    sentence terminator; rewrites any run of terminators as '. '; and
    squeezes whitespace runs down to single spaces.
    """
    term = re.escape(".!?:;")
    cleaned = re.sub(r"[^%s\sA-Za-z]+" % term, "", text)
    cleaned = re.sub(r"\s*([%s]+\s*)+" % term, ". ", cleaned)
    return re.sub(r"\s+", " ", cleaned)
|
||||||
|
|
||||||
|
|
||||||
|
def text_stats(text, wc):
    """Return ``(sentences, words, syllables)`` counts for *text*.

    :param text: raw prose; run through :func:`normalize` first.
    :param wc: precomputed word count; when truthy it is used instead of
        counting the words here.
    :return: 3-tuple ``(sentence_count, word_count, syllable_count)``.
    """
    text = normalize(text)

    # Split into sentences, then each sentence into words, dropping
    # "sentences" of fewer than two words (splitting artifacts).
    # NOTE: the original used filter(), whose lazy iterator raises
    # TypeError on len() under Python 3 and would be exhausted by the
    # first sum() pass anyway — materialise it as a list.
    stcs = [s.split(" ") for s in text.split(". ")]
    stcs = [s for s in stcs if len(s) >= 2]

    if wc:
        words = wc
    else:
        words = sum(len(s) for s in stcs)

    sbls = sum(syllables(w) for s in stcs for w in s)

    return len(stcs), words, sbls
|
||||||
|
|
||||||
|
|
||||||
|
def flesch_index(stats):
    """Flesch reading-ease score from a ``(sentences, words, syllables)`` tuple.

    Returns 0 when there are no sentences or no words, avoiding a
    division by zero.  Higher scores indicate easier text.
    """
    stcs, words, sbls = stats
    if not (stcs and words):
        return 0
    return 206.835 - 1.015 * (words / stcs) - 84.6 * (sbls / words)
|
||||||
|
|
||||||
|
|
||||||
|
def flesch_kincaid_level(stats):
    """Flesch-Kincaid U.S. grade level from a ``(sentences, words, syllables)`` tuple.

    Returns 0 when there are no sentences or no words, avoiding a
    division by zero.
    """
    stcs, words, sbls = stats
    if not (stcs and words):
        return 0
    return 0.39 * (words / stcs) + 11.8 * (sbls / words) - 15.59
|
||||||
49
post_stats/readme.rst
Normal file
49
post_stats/readme.rst
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
Post Statistics
|
||||||
|
==================
|
||||||
|
|
||||||
|
A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
|
||||||
|
|
||||||
|
- ``wc``: how many words
|
||||||
|
- ``read_mins``: how many minutes would it take to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
|
||||||
|
- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds
|
||||||
|
- ``fi``: Flesch-kincaid Index/ Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
|
||||||
|
- ``fk``: Flesch-kincaid Grade Level
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
{
|
||||||
|
'wc': 2760,
|
||||||
|
'fi': '65.94',
|
||||||
|
'fk': '7.65',
|
||||||
|
'word_counts': Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, ...}),
|
||||||
|
'read_mins': 12
|
||||||
|
}
|
||||||
|
|
||||||
|
This allows you to output these values in your templates, like this, for example:
|
||||||
|
|
||||||
|
.. code-block:: html+jinja
|
||||||
|
|
||||||
|
<p title="~{{ article.stats['wc'] }} words">~{{ article.stats['read_mins'] }} min read</p>
|
||||||
|
<ul>
|
||||||
|
<li>Flesch-kincaid Index/ Reading Ease: {{ article.stats['fi'] }}</li>
|
||||||
|
<li>Flesch-kincaid Grade Level: {{ article.stats['fk'] }}</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
The ``word_counts`` variable is a python ``Counter`` dictionary and looks something like this, with each unique word and its frequency:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
|
||||||
|
|
||||||
|
and can be used to create a tag/word cloud for a post.
|
||||||
|
|
||||||
|
Requirements
|
||||||
|
----------------
|
||||||
|
|
||||||
|
`post_stats` requires BeautifulSoup.
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ pip install beautifulsoup4
|
||||||
Reference in New Issue
Block a user