From d38b4a802820a06f8201f300afbb2585f385f9dc Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 02:29:08 -0700
Subject: [PATCH 1/6] Initial commit of Post Statistics plugin

---
 post_stats/__init__.py   |  1 +
 post_stats/post_stats.py | 74 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 post_stats/__init__.py
 create mode 100644 post_stats/post_stats.py

diff --git a/post_stats/__init__.py b/post_stats/__init__.py
new file mode 100644
index 0000000..2313bd5
--- /dev/null
+++ b/post_stats/__init__.py
@@ -0,0 +1 @@
+from .post_stats import *

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
new file mode 100644
index 0000000..7ab30af
--- /dev/null
+++ b/post_stats/post_stats.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Post Statistics
+========================
+
+This plugin calculates various Statistics about a post and stores them in an article.stats disctionary.
+
+wc: how many words
+read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+word_count: frequency count of all the words in the article; can be used for tag/word clouds
+
+"""
+
+from pelican import signals, contents
+
+# import nltk
+
+from bs4 import BeautifulSoup
+
+# import lxml.html
+# from lxml.html.clean import Cleaner
+
+import re
+from collections import Counter
+
+
+def calculate_stats(instance):
+
+    WPM = 250
+
+    if instance._content is not None:
+        stats = {}
+        content = instance._content
+
+        # print content
+        entities = r'\&\#?.+?;'
+        content = content.replace('&nbsp;', ' ')
+        content = re.sub(entities, '', content)
+        # print content
+
+        # Pre-process the text to remove punctuation
+        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
+        content = content.translate(dict((ord(c), u'') for c in drop))
+
+        # nltk
+        # raw_text = nltk.clean_html(content)
+
+        # BeautifulSoup
+        raw_text = BeautifulSoup(content).getText()
+        # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
+
+        # lxml
+        # cleaner = Cleaner(style=True)
+        # html = lxml.html.fromstring(content)
+        # raw_text = cleaner.clean_html(html).text_content()
+
+        # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
+
+        # print raw_text
+
+        words = raw_text.lower().split()
+        word_count = Counter(words)
+        # print word_count
+
+        stats['word_counts'] = word_count
+        stats['wc'] = sum(word_count.values())
+        stats['read_minutes'] = (stats['wc'] + WPM // 2) // WPM
+
+        instance.stats = stats
+        instance.raw_text = raw_text
+
+
+def register():
+    signals.content_object_init.connect(calculate_stats)
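This first patch already contains the whole pipeline: strip HTML entities and punctuation, extract the visible text with BeautifulSoup, tally the words with a Counter, and turn the total into minutes with integer arithmetic. A minimal sketch of the tallying and rounding steps outside Pelican, assuming only that beautifulsoup4 is installed; the sample HTML string and variable names are illustrative:

.. code-block:: python

    from collections import Counter
    from bs4 import BeautifulSoup

    WPM = 250  # assumed average reading speed, as in the patch

    html = u"<p>Just a <em>tiny</em> example post.</p>"
    # Keep only the visible text
    raw_text = BeautifulSoup(html, "html.parser").getText()

    # Tally the lower-cased words
    word_count = Counter(raw_text.lower().split())
    wc = sum(word_count.values())

    # Round to the nearest whole minute, as this patch does
    read_minutes = (wc + WPM // 2) // WPM

    print(wc, read_minutes, word_count.most_common(3))

Note that round-to-nearest turns any post shorter than about 125 words into a zero-minute read; the next two patches change exactly that.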
From b49d8e8083f5d1d84cf8d3f75ebdddb942aa4ba3 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 10:54:54 -0700
Subject: [PATCH 2/6] Minor tidy; simplified read_minutes calculation

---
 post_stats/post_stats.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 7ab30af..f8a91f2 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -11,7 +11,8 @@ word_count: frequency count of all the words in the article; can be used for tag/
 
 """
 
-from pelican import signals, contents
+from pelican import signals
+# import math
 
 # import nltk
 
@@ -26,6 +27,7 @@ from collections import Counter
 
 
 def calculate_stats(instance):
 
+    # How fast do average people read?
     WPM = 250
 
     if instance._content is not None:
@@ -64,7 +66,8 @@ def calculate_stats(instance):
 
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
-        stats['read_minutes'] = (stats['wc'] + WPM // 2) // WPM
+        # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
+        stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
 
         instance.stats = stats
         instance.raw_text = raw_text

From 2dd13f3fa0ed33cba0ae12369bf470d1179a3573 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 11:04:46 -0700
Subject: [PATCH 3/6] Made read_minutes 1 minute minimum

---
 post_stats/post_stats.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index f8a91f2..8270dfb 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -68,6 +68,8 @@ def calculate_stats(instance):
         stats['wc'] = sum(word_count.values())
         # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
         stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
+        if stats['read_minutes'] == 0:
+            stats['read_minutes'] = 1
 
         instance.stats = stats
         instance.raw_text = raw_text
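Patches 2 and 3 replace round-to-nearest with a round-up and a one-minute floor: ``(wc + WPM - 1) // WPM`` is the usual integer ceiling-division idiom, equivalent to the ``math.ceil`` variant left in the comment. A small check of that equivalence; the test values are arbitrary:

.. code-block:: python

    import math

    WPM = 250

    def read_minutes(wc):
        mins = (wc + WPM - 1) // WPM  # integer ceiling division
        return max(1, mins)           # never report a zero-minute read

    for wc in (0, 1, 249, 250, 251, 1000):
        assert read_minutes(wc) == max(1, int(math.ceil(wc / float(WPM))))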
From 546d003682eb7e192f4d54f8d6272e42087ecedf Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 14:25:10 -0700
Subject: [PATCH 4/6] Tidied up; removed commented-out nltk & lxml alternative versions

---
 post_stats/post_stats.py | 50 +++++++++++-----------------------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 8270dfb..6b38dd3 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -3,76 +3,54 @@
 Post Statistics
 ========================
 
-This plugin calculates various Statistics about a post and stores them in an article.stats disctionary.
+This plugin calculates various Statistics about a post and stores them in an article.stats disctionary:
 
 wc: how many words
-read_minutes: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
-word_count: frequency count of all the words in the article; can be used for tag/word clouds
+read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+word_counts: frequency count of all the words in the article; can be used for tag/word clouds
 
 """
 
 from pelican import signals
-# import math
-
-# import nltk
-
 from bs4 import BeautifulSoup
-
-# import lxml.html
-# from lxml.html.clean import Cleaner
-
 import re
 from collections import Counter
 
 
 def calculate_stats(instance):
 
-    # How fast do average people read?
-    WPM = 250
-
     if instance._content is not None:
         stats = {}
         content = instance._content
 
-        # print content
+        # How fast do average people read?
+        WPM = 250
+
+        # Pre-process the text to remove entities
        entities = r'\&\#?.+?;'
         content = content.replace('&nbsp;', ' ')
         content = re.sub(entities, '', content)
-        # print content
 
         # Pre-process the text to remove punctuation
         drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
         content = content.translate(dict((ord(c), u'') for c in drop))
 
-        # nltk
-        # raw_text = nltk.clean_html(content)
-
-        # BeautifulSoup
+        # Use BeautifulSoup to get readable/visible text
         raw_text = BeautifulSoup(content).getText()
-        # raw_text = ''.join(BeautifulSoup(content).findAll(text=True))
-
-        # lxml
-        # cleaner = Cleaner(style=True)
-        # html = lxml.html.fromstring(content)
-        # raw_text = cleaner.clean_html(html).text_content()
-
-        # stats['wc'] = len(re.findall(r'\b', raw_text)) >> 1
-
-        # print raw_text
-
+        # Count the words in the text
         words = raw_text.lower().split()
         word_count = Counter(words)
-        # print word_count
 
+        # Return the stats
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
-        # stats['read_minutes'] = math.ceil(float(stats['wc']) / float(WPM))
-        stats['read_minutes'] = (stats['wc'] + WPM - 1) // WPM
-        if stats['read_minutes'] == 0:
-            stats['read_minutes'] = 1
+        # Calculate how long it'll take to read, rounding up
+        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
+        if stats['read_mins'] == 0:
+            stats['read_mins'] = 1
 
         instance.stats = stats
         instance.raw_text = raw_text
 
 
 def register():
     signals.content_object_init.connect(calculate_stats)
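Patch 4 settles on one order of operations: turn ``&nbsp;`` into a real space, strip the remaining entities with a non-greedy regex, drop punctuation, and only then extract the visible text. The entity-stripping step can be tried on its own; the input string here is illustrative:

.. code-block:: python

    import re

    entities = r'\&\#?.+?;'

    content = "Fish &amp; chips cost&nbsp;&pound;5 &#8212; cheap!"
    content = content.replace('&nbsp;', ' ')
    content = re.sub(entities, '', content)

    print(content)  # -> "Fish  chips cost 5  cheap!"

The doubled spaces left behind are harmless, because the text is split on whitespace before counting.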
From 69f7b1fb0a6c0fcf755e398d1e7b1ab96648c2b0 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 21:19:39 -0700
Subject: [PATCH 5/6] Added Flesch-Kincaid readability scores; added readme; final tidy

---
 post_stats/post_stats.py  | 34 +++++++++++++++++-------
 post_stats/readability.py | 56 +++++++++++++++++++++++++++++++++++++++
 post_stats/readme.rst     | 49 ++++++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+), 10 deletions(-)
 create mode 100644 post_stats/readability.py
 create mode 100644 post_stats/readme.rst

diff --git a/post_stats/post_stats.py b/post_stats/post_stats.py
index 6b38dd3..3f7c81e 100644
--- a/post_stats/post_stats.py
+++ b/post_stats/post_stats.py
@@ -3,11 +3,13 @@
 Post Statistics
 ========================
 
-This plugin calculates various Statistics about a post and stores them in an article.stats disctionary:
+This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
 
 wc: how many words
 read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
 word_counts: frequency count of all the words in the article; can be used for tag/word clouds
+fi: Flesch-Kincaid Index / Reading Ease
+fk: Flesch-Kincaid Grade Level
 
 """
@@ -16,6 +18,8 @@
 import re
 from collections import Counter
 
+from .readability import *
+
 
 def calculate_stats(instance):
 
@@ -26,18 +30,22 @@ def calculate_stats(instance):
         # How fast do average people read?
         WPM = 250
 
-        # Pre-process the text to remove entities
-        entities = r'\&\#?.+?;'
-        content = content.replace('&nbsp;', ' ')
-        content = re.sub(entities, '', content)
-
-        # Pre-process the text to remove punctuation
-        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
-        content = content.translate(dict((ord(c), u'') for c in drop))
-
         # Use BeautifulSoup to get readable/visible text
         raw_text = BeautifulSoup(content).getText()
+
+        # Process the text to remove entities
+        entities = r'\&\#?.+?;'
+        raw_text = raw_text.replace('&nbsp;', ' ')
+        raw_text = re.sub(entities, '', raw_text)
+
+        # Flesch-Kincaid readability stats count sentences,
+        # so save before removing punctuation
+        tmp = raw_text
+
+        # Process the text to remove punctuation
+        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
+        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))
         # Count the words in the text
         words = raw_text.lower().split()
         word_count = Counter(words)
@@ -45,11 +53,17 @@ def calculate_stats(instance):
 
         # Return the stats
         stats['word_counts'] = word_count
         stats['wc'] = sum(word_count.values())
+
         # Calculate how long it'll take to read, rounding up
         stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
         if stats['read_mins'] == 0:
             stats['read_mins'] = 1
 
+        # Calculate Flesch-Kincaid readability stats
+        readability_stats = stcs, words, sbls = text_stats(tmp, stats['wc'])
+        stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
+        stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))
+
         instance.stats = stats

diff --git a/post_stats/readability.py b/post_stats/readability.py
new file mode 100644
index 0000000..0cc1c71
--- /dev/null
+++ b/post_stats/readability.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Adapted from here: http://acdx.net/calculating-the-flesch-kincaid-level-in-python/
+# See here for details: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test
+
+from __future__ import division
+import re
+
+
+def mean(seq):
+    return sum(seq) / len(seq)
+
+
+def syllables(word):
+    if len(word) <= 3:
+        return 1
+
+    word = re.sub(r"(es|ed|(?<!l)e)$", "", word)
+    return len(re.findall(r"[aeiouy]+", word))
+
+
+def normalize(text):
+    terminators = ".!?:;"
+    term = re.escape(terminators)
+    text = re.sub(r"[^%s\sA-Za-z]+" % term, "", text)
+    text = re.sub(r"\s*([%s]+\s*)+" % term, ". ", text)
+    return re.sub(r"\s+", " ", text)
+
+
+def text_stats(text, wc):
+    text = normalize(text)
+    stcs = [s.split(" ") for s in text.split(". ")]
+    stcs = filter(lambda s: len(s) >= 2, stcs)
+
+    if wc:
+        words = wc
+    else:
+        words = sum(len(s) for s in stcs)
+
+    sbls = sum(syllables(w) for s in stcs for w in s)
+
+    return len(stcs), words, sbls
+
+
+def flesch_index(stats):
+    stcs, words, sbls = stats
+    if stcs == 0 or words == 0:
+        return 0
+    return 206.835 - 1.015 * (words / stcs) - 84.6 * (sbls / words)
+
+
+def flesch_kincaid_level(stats):
+    stcs, words, sbls = stats
+    if stcs == 0 or words == 0:
+        return 0
+    return 0.39 * (words / stcs) + 11.8 * (sbls / words) - 15.59

diff --git a/post_stats/readme.rst b/post_stats/readme.rst
new file mode 100644
index 0000000..f607d3b
--- /dev/null
+++ b/post_stats/readme.rst
@@ -0,0 +1,49 @@
+Post Statistics
+==================
+
+A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
+
+- ``wc``: how many words
+- ``read_mins``: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds/
+- ``fi``: Flesch-Kincaid Index / Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
+- ``fk``: Flesch-Kincaid Grade Level
+
+Example:
+
+.. code-block:: python
+
+    {
+        'wc': 2760,
+        'fi': '65.94',
+        'fk': '7.65',
+        'word_counts': Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, ...}),
+        'read_mins': 12
+    }
+
+This allows you to output these values in your templates, like this, for example:
+
+.. code-block:: html+jinja
+
+    <p>~{{ article.stats['read_mins'] }} min read</p>
+    <ul>
+    <li>Flesch-Kincaid Index / Reading Ease: {{ article.stats['fi'] }}</li>
+    <li>Flesch-Kincaid Grade Level: {{ article.stats['fk'] }}</li>
+    </ul>
+
+The ``word_counts`` variable is a Counter dictionary and looks like this, with each unique word and its frequency:
+
+.. code-block:: python
+
+    Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
+
+and could be used to create a tag/word cloud for a post.
+
+Requirements
+============
+
+`post_stats` requires BeautifulSoup.
+
+.. code-block:: console
+
+    $ pip install beautifulsoup4

From f629e3b012ed4d49bce1b4721664af8f15ebd5c8 Mon Sep 17 00:00:00 2001
From: Duncan Lock
Date: Sun, 23 Jun 2013 21:24:20 -0700
Subject: [PATCH 6/6] Tidy up readme

---
 post_stats/readme.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/post_stats/readme.rst b/post_stats/readme.rst
index f607d3b..89fc13b 100644
--- a/post_stats/readme.rst
+++ b/post_stats/readme.rst
@@ -4,8 +4,8 @@ Post Statistics
 A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
 
 - ``wc``: how many words
-- ``read_mins``: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
-- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds/
+- ``read_mins``: how many minutes would it take to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
+- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds
 - ``fi``: Flesch-Kincaid Index / Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
 - ``fk``: Flesch-Kincaid Grade Level
 
@@ -31,16 +31,16 @@ This allows you to output these values in your templates, like this, for example
     <li>Flesch-Kincaid Grade Level: {{ article.stats['fk'] }}</li>
     </ul>
 
-The ``word_counts`` variable is a Counter dictionary and looks like this, with each unique word and its frequency:
+The ``word_counts`` variable is a python ``Counter`` dictionary and looks something like this, with each unique word and its frequency:
 
 .. code-block:: python
 
     Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
 
-and could be used to create a tag/word cloud for a post.
+and can be used to create a tag/word cloud for a post.
 
 Requirements
-============
+----------------
 
 `post_stats` requires BeautifulSoup.
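After the full series, ``readability.py`` can also be used on its own, at least under Python 2, where ``filter()`` returns a list as ``text_stats()`` expects. A short usage sketch; the sample text is made up, and the import path assumes the plugin is importable as a ``post_stats`` package:

.. code-block:: python

    from post_stats.readability import (text_stats, flesch_index,
                                        flesch_kincaid_level)

    text = "The cat sat on the mat. It was warm. The cat slept."
    stats = text_stats(text, 0)  # passing 0 makes it count the words itself

    print(flesch_index(stats))          # Reading Ease: higher is easier
    print(flesch_kincaid_level(stats))  # approximate US school grade level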