From d38b4a802820a06f8201f300afbb2585f385f9dc Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 02:29:08 -0700
Subject: [PATCH 1/6] Initial commit of Post Statistics plugin

---
 post_stats/__init__.py   |  1 +
 post_stats/post_stats.py | 74 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 post_stats/__init__.py
 create mode 100644 post_stats/post_stats.py

diff --git a/post_stats/__init__.py b/post_stats/__init__.py
new file mode 100644
index 0000000..2313bd5
--- /dev/null
+++ b/post_stats/__init__.py
@@ -0,0 +1 @@
+from .post_stats import *

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
new file mode 100644
index 0000000..7ab30af
--- /dev/null
+++ b/post_stats/post_stats.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Post Statistics
+========================
+
+This plugin calculates various Statistics about a post and stores them in an article.stats disctionary.
+
+wc: how many words
+read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+word_count: frequency count of all the words in the article; can be used for tag/word clouds
+
+"""
+
+from pelican import signals, contents
+
+# import nltk
+
+from bs4 import BeautifulSoup
+
+# import lxml.html
+# from lxml.html.clean import Cleaner
+
+import re
+from collections import Counter
+
+
+def calculate_stats(instance):
+
+    WPM = 250
+
+    if instance._content is not None:
+        stats = {}
+        content = instance._content
+
+        # print content
+        entities = r'\&\#?.+?;'
+        content = content.replace('&nbsp;', ' ')
+        content = re.sub(entities, '', content)
+        # print content
+
+        # Pre-process the text to remove punctuation
+        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
+        content = content.translate(dict((ord(c), u'') for c in drop))
+
+        # nltk
+        # raw_text = nltk.clean_html(content)
+
+        # BeautifulSoup
+        raw_text = BeautifulSoup(content).getText()
+        # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
+
+        # lxml
+        # cleaner = Cleaner(style=True)
+        # html = lxml.html.fromstring(content)
+        # raw_text = cleaner.clean_html(html).text_content()
+
+        # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
+
+        # print raw_text
+
+        words = raw_text.lower().split()
+        word_count = Counter(words)
+        # print word_count
+
+        stats['word_counts'] = word_count
+        stats['wc'] = sum(word_count.values())
+        stats['read_minutes'] = (stats['wc'] + WPM // 2) // WPM
+
+        instance.stats = stats
+        instance.raw_text = raw_text
+
+
+def register():
+    signals.content_object_init.connect(calculate_stats)
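This first patch already contains the whole pipeline: strip HTML entities and punctuation, extract the visible text with BeautifulSoup, tally the words with a Counter, and turn the total into minutes with integer arithmetic. A minimal sketch of the tallying and rounding steps outside Pelican, assuming only that beautifulsoup4 is installed; the sample HTML string and variable names are illustrative:

.. code-block:: python

    from collections import Counter
    from bs4 import BeautifulSoup

    WPM = 250  # assumed average reading speed, as in the patch

    html = u"<p>Just a <em>tiny</em> example post.</p>"
    # Keep only the visible text
    raw_text = BeautifulSoup(html, "html.parser").getText()

    # Tally the lower-cased words
    word_count = Counter(raw_text.lower().split())
    wc = sum(word_count.values())

    # Round to the nearest whole minute, as this patch does
    read_minutes = (wc + WPM // 2) // WPM

    print(wc, read_minutes, word_count.most_common(3))

Note that round-to-nearest turns any post shorter than about 125 words into a zero-minute read; the next two patches change exactly that.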
From b49d8e8083f5d1d84cf8d3f75ebdddb942aa4ba3 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 10:54:54 -0700
Subject: [PATCH 2/6] Minor tidy; simplified read_minutes calculation

---
 post_stats/post_stats.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 7ab30af..f8a91f2 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -11,7 +11,8 @@ word_count: frequency count of all the words in the article; can be used for tag/
 
 """
 
-from pelican import signals, contents
+from pelican import signals
+# import math
 
 # import nltk
 
@@ -26,6 +27,7 @@ from collections import Counter
 
 
 def calculate_stats(instance):
 
+    # How fast do average people read?
     WPM = 250
 
     if instance._content is not None:
@@ -64,7 +66,8 @@ def calculate_stats(instance):
 
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
-        stats['read_minutes'] = (stats['wc'] + WPM // 2) // WPM
+        # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
+        stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
 
         instance.stats = stats
         instance.raw_text = raw_text

From 2dd13f3fa0ed33cba0ae12369bf470d1179a3573 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 11:04:46 -0700
Subject: [PATCH 3/6] Made read_minutes 1 minute minimum

---
 post_stats/post_stats.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index f8a91f2..8270dfb 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -68,6 +68,8 @@ def calculate_stats(instance):
         stats['wc'] = sum(word_count.values())
         # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
         stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
+        if stats['read_minutes'] == 0:
+            stats['read_minutes'] = 1
 
         instance.stats = stats
         instance.raw_text = raw_text
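Patches 2 and 3 replace round-to-nearest with a round-up and a one-minute floor: ``(wc + WPM - 1) // WPM`` is the usual integer ceiling-division idiom, equivalent to the ``math.ceil`` variant left in the comment. A small check of that equivalence; the test values are arbitrary:

.. code-block:: python

    import math

    WPM = 250

    def read_minutes(wc):
        mins = (wc + WPM - 1) // WPM  # integer ceiling division
        return max(1, mins)           # never report a zero-minute read

    for wc in (0, 1, 249, 250, 251, 1000):
        assert read_minutes(wc) == max(1, int(math.ceil(wc / float(WPM))))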
From 546d003682eb7e192f4d54f8d6272e42087ecedf Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 14:25:10 -0700
Subject: [PATCH 4/6] Tidied up; removed commented-out nltk & lxml alternative versions

---
 post_stats/post_stats.py | 50 +++++++++++-----------------------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 8270dfb..6b38dd3 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -3,76 +3,54 @@
 Post Statistics
 ========================
 
-This plugin calculates various Statistics about a post and stores them in an article.stats disctionary.
+This plugin calculates various Statistics about a post and stores them in an article.stats disctionary:
 
 wc: how many words
-read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
-word_count: frequency count of all the words in the article; can be used for tag/word clouds
+read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+word_counts: frequency count of all the words in the article; can be used for tag/word clouds
 
 """
 
 from pelican import signals
-# import math
-
-# import nltk
-
 from bs4 import BeautifulSoup
-
-# import lxml.html
-# from lxml.html.clean import Cleaner
-
 import re
 from collections import Counter
 
 
 def calculate_stats(instance):
 
-    # How fast do average people read?
-    WPM = 250
-
     if instance._content is not None:
         stats = {}
         content = instance._content
 
-        # print content
+        # How fast do average people read?
+        WPM = 250
+
+        # Pre-process the text to remove entities
        entities = r'\&\#?.+?;'
         content = content.replace('&nbsp;', ' ')
         content = re.sub(entities, '', content)
-        # print content
 
         # Pre-process the text to remove punctuation
         drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
         content = content.translate(dict((ord(c), u'') for c in drop))
 
-        # nltk
-        # raw_text = nltk.clean_html(content)
-
-        # BeautifulSoup
+        # Use BeautifulSoup to get readable/visible text
         raw_text = BeautifulSoup(content).getText()
-        # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
-
-        # lxml
-        # cleaner = Cleaner(style=True)
-        # html = lxml.html.fromstring(content)
-        # raw_text = cleaner.clean_html(html).text_content()
-
-        # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
-
-        # print raw_text
-
+        # Count the words in the text
         words = raw_text.lower().split()
         word_count = Counter(words)
-        # print word_count
 
+        # Return the stats
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
-        # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
-        stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
-        if stats['read_minutes'] == 0:
-            stats['read_minutes'] = 1
+        # Calculate how long it'll take to read, rounding up
+        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
+        if stats['read_mins'] == 0:
+            stats['read_mins'] = 1
 
         instance.stats = stats
         instance.raw_text = raw_text
 
 
 def register():
     signals.content_object_init.connect(calculate_stats)
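Patch 4 settles on one order of operations: turn ``&nbsp;`` into a real space, strip the remaining entities with a non-greedy regex, drop punctuation, and only then extract the visible text. The entity-stripping step can be tried on its own; the input string here is illustrative:

.. code-block:: python

    import re

    entities = r'\&\#?.+?;'

    content = "Fish &amp; chips cost&nbsp;&pound;5 &#8212; cheap!"
    content = content.replace('&nbsp;', ' ')
    content = re.sub(entities, '', content)

    print(content)  # -> "Fish  chips cost 5  cheap!"

The doubled spaces left behind are harmless, because the text is split on whitespace before counting.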
From 69f7b1fb0a6c0fcf755e398d1e7b1ab96648c2b0 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 21:19:39 -0700
Subject: [PATCH 5/6] Added Flesch-Kincaid readability scores; added readme; final tidy

---
 post_stats/post_stats.py  | 34 +++++++++++++++++-------
 post_stats/readability.py | 56 +++++++++++++++++++++++++++++++++++++++
 post_stats/readme.rst     | 49 ++++++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+), 10 deletions(-)
 create mode 100644 post_stats/readability.py
 create mode 100644 post_stats/readme.rst

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 6b38dd3..3f7c81e 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -3,11 +3,13 @@
 Post Statistics
 ========================
 
-This plugin calculates various Statistics about a post and stores them in an article.stats disctionary:
+This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
 
 wc: how many words
 read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
 word_counts: frequency count of all the words in the article; can be used for tag/word clouds
+fi: Flesch-Kincaid Index / Reading Ease
+fk: Flesch-Kincaid Grade Level
 
 """
@@ -16,6 +18,8 @@
 import re
 from collections import Counter
 
+from .readability import *
+
 
 def calculate_stats(instance):
 
@@ -26,18 +30,22 @@ def calculate_stats(instance):
         # How fast do average people read?
         WPM = 250
 
-        # Pre-process the text to remove entities
-        entities = r'\&\#?.+?;'
-        content = content.replace('&nbsp;', ' ')
-        content = re.sub(entities, '', content)
-
-        # Pre-process the text to remove punctuation
-        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
-        content = content.translate(dict((ord(c), u'') for c in drop))
-
         # Use BeautifulSoup to get readable/visible text
         raw_text = BeautifulSoup(content).getText()
+
+        # Process the text to remove entities
+        entities = r'\&\#?.+?;'
+        raw_text = raw_text.replace('&nbsp;', ' ')
+        raw_text = re.sub(entities, '', raw_text)
+
+        # Flesch-Kincaid readability stats count sentences,
+        # so save before removing punctuation
+        tmp = raw_text
+
+        # Process the text to remove punctuation
+        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
+        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))
         # Count the words in the text
         words = raw_text.lower().split()
         word_count = Counter(words)
@@ -45,11 +53,17 @@ def calculate_stats(instance):
 
         # Return the stats
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
+
         # Calculate how long it'll take to read, rounding up
         stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
         if stats['read_mins'] == 0:
             stats['read_mins'] = 1
 
+        # Calculate Flesch-Kincaid readability stats
+        readability_stats = stcs, words, sbls = text_stats(tmp, stats['wc'])
+        stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
+        stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))
+
         instance.stats = stats

diff --git a/post_stats/readability.py b/post_stats/readability.py
new file mode 100644
index 0000000..0cc1c71
--- /dev/null
+++ b/post_stats/readability.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Adapted from here: http://acdx.net/calculating-the-flesch-kincaid-level-in-python/
+# See here for details: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test
+
+from __future__ import division
+import re
+
+
+def mean(seq):
+    return sum(seq) / len(seq)
+
+
+def syllables(word):
+    if len(word) <= 3:
+        return 1
+
+    word = re.sub(r"(es|ed|(?<!l)e)$", "", word)
+    return len(re.findall(r"[aeiouy]+", word))
+
+
+def normalize(text):
+    terminators = ".!?:;"
+    term = re.escape(terminators)
+    text = re.sub(r"[^%s\sA-Za-z]+" % term, "", text)
+    text = re.sub(r"\s*([%s]+\s*)+" % term, ". ", text)
+    return re.sub(r"\s+", " ", text)
+
+
+def text_stats(text, wc):
+    text = normalize(text)
+    stcs = [s.split(" ") for s in text.split(". ")]
+    stcs = filter(lambda s: len(s) >= 2, stcs)
+
+    if wc:
+        words = wc
+    else:
+        words = sum(len(s) for s in stcs)
+
+    sbls = sum(syllables(w) for s in stcs for w in s)
+
+    return len(stcs), words, sbls
+
+
+def flesch_index(stats):
+    stcs, words, sbls = stats
+    if stcs == 0 or words == 0:
+        return 0
+    return 206.835 - 1.015 * (words / stcs) - 84.6 * (sbls / words)
+
+
+def flesch_kincaid_level(stats):
+    stcs, words, sbls = stats
+    if stcs == 0 or words == 0:
+        return 0
+    return 0.39 * (words / stcs) + 11.8 * (sbls / words) - 15.59

diff --git a/post_stats/readme.rst b/post_stats/readme.rst
new file mode 100644
index 0000000..f607d3b
--- /dev/null
+++ b/post_stats/readme.rst
@@ -0,0 +1,49 @@
+Post Statistics
+==================
+
+A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
+
+- ``wc``: how many words
+- ``read_mins``: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds/
+- ``fi``: Flesch-Kincaid Index / Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
+- ``fk``: Flesch-Kincaid Grade Level
+
+Example:
+
+.. code-block:: python
+
+    {
+        'wc': 2760,
+        'fi': '65.94',
+        'fk': '7.65',
+        'word_counts': Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, ...}),
+        'read_mins': 12
+    }
+
+This allows you to output these values in your templates, like this, for example:
+
+.. code-block:: html+jinja
+
+    <p>~{{ article.stats['read_mins'] }} min read</p>
+    <ul>
+    <li>Flesch-Kincaid Index / Reading Ease: {{ article.stats['fi'] }}</li>
+    <li>Flesch-Kincaid Grade Level: {{ article.stats['fk'] }}</li>
+    </ul>
+
+The ``word_counts`` variable is a Counter dictionary and looks like this, with each unique word and its frequency:
+
+.. code-block:: python
+
+    Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
+
+and could be used to create a tag/word cloud for a post.
+
+Requirements
+============
+
+`post_stats` requires BeautifulSoup.
+
+.. code-block:: console
+
+    $ pip install beautifulsoup4

From f629e3b012ed4d49bce1b4721664af8f15ebd5c8 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 21:24:20 -0700
Subject: [PATCH 6/6] Tidy up readme

---
 post_stats/readme.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/post_stats/readme.rst b/post_stats/readme.rst
index f607d3b..89fc13b 100644
--- a/post_stats/readme.rst
+++ b/post_stats/readme.rst
@@ -4,8 +4,8 @@ Post Statistics
 A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
 
 - ``wc``: how many words
-- ``read_mins``: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
-- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds/
+- ``read_mins``: how many minutes would it take to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds
 - ``fi``: Flesch-Kincaid Index / Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
 - ``fk``: Flesch-Kincaid Grade Level
 
@@ -31,16 +31,16 @@ This allows you to output these values in your templates, like this, for example
     <li>Flesch-Kincaid Grade Level: {{ article.stats['fk'] }}</li>
     </ul>
 
-The ``word_counts`` variable is a Counter dictionary and looks like this, with each unique word and its frequency:
+The ``word_counts`` variable is a python ``Counter`` dictionary and looks something like this, with each unique word and its frequency:
 
 .. code-block:: python
 
     Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
 
-and could be used to create a tag/word cloud for a post.
+and can be used to create a tag/word cloud for a post.
 
 Requirements
-============
+----------------
 
 `post_stats` requires BeautifulSoup.
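After the full series, ``readability.py`` can also be used on its own, at least under Python 2, where ``filter()`` returns a list as ``text_stats()`` expects. A short usage sketch; the sample text is made up, and the import path assumes the plugin is importable as a ``post_stats`` package:

.. code-block:: python

    from post_stats.readability import (text_stats, flesch_index,
                                        flesch_kincaid_level)

    text = "The cat sat on the mat. It was warm. The cat slept."
    stats = text_stats(text, 0)  # passing 0 makes it count the words itself

    print(flesch_index(stats))          # Reading Ease: higher is easier
    print(flesch_kincaid_level(stats))  # approximate US school grade level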