1
post_stats/__init__.py
Normal file
1
post_stats/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
from .post_stats import *
|
||||||
71
post_stats/post_stats.py
Normal file
71
post_stats/post_stats.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Post Statistics
|
||||||
|
========================
|
||||||
|
|
||||||
|
This plugin calculates various statistics about a post and stores them in an article.stats dictionary:
|
||||||
|
|
||||||
|
wc: how many words
|
||||||
|
read_mins: how many minutes to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
|
||||||
|
word_counts: frequency count of all the words in the article; can be used for tag/word clouds
|
||||||
|
fi: Flesch-kincaid Index/ Reading Ease
|
||||||
|
fk: Flesch-kincaid Grade Level
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pelican import signals
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from .readability import *
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_stats(instance):
    """Compute per-article statistics and attach them as ``instance.stats``.

    Signal handler for Pelican's ``content_object_init``.  The stats dict
    contains:

    - ``wc``: total word count
    - ``word_counts``: ``collections.Counter`` of word frequencies
    - ``read_mins``: estimated reading time in whole minutes (250 wpm),
      never less than 1
    - ``fi``: Flesch reading-ease index, formatted to two decimals
    - ``fk``: Flesch-Kincaid grade level, formatted to two decimals

    :param instance: a Pelican content object; left untouched when its
        ``_content`` is ``None``.
    """
    if instance._content is None:
        return

    stats = {}
    content = instance._content

    # How fast do average people read?
    WPM = 250

    # Use BeautifulSoup to get readable/visible text.  Name the parser
    # explicitly: the original relied on the default, which emits a
    # warning and varies with whatever parser happens to be installed.
    raw_text = BeautifulSoup(content, 'html.parser').getText()

    # Process the text to remove entities.  Replace non-breaking spaces
    # first (the original line had degenerated into a no-op
    # ' ' -> ' ' replace), then strip remaining &...;/&#...; entities.
    entities = r'\&\#?.+?;'
    raw_text = raw_text.replace('&nbsp;', ' ')
    raw_text = re.sub(entities, '', raw_text)

    # Flesch-Kincaid readability stats count sentences, so keep a copy
    # before punctuation is removed.
    punctuated = raw_text

    # Process the text to remove punctuation (incl. typographic quotes,
    # dashes and ellipses).
    drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
    raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))

    # Count the words in the text.
    word_count = Counter(raw_text.lower().split())
    stats['word_counts'] = word_count
    stats['wc'] = sum(word_count.values())

    # Calculate how long it'll take to read, rounding up; a non-empty
    # article always reports at least one minute.
    stats['read_mins'] = max(1, (stats['wc'] + WPM - 1) // WPM)

    # Calculate Flesch-Kincaid readability stats.  (The original bound
    # the tuple to three extra, unused locals via chained assignment.)
    readability_stats = text_stats(punctuated, stats['wc'])
    stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
    stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))

    instance.stats = stats
|
||||||
|
|
||||||
|
|
||||||
|
def register():
    # Pelican plugin entry point: run calculate_stats whenever a content
    # object (article or page) is initialised.
    signals.content_object_init.connect(calculate_stats)
|
||||||
56
post_stats/readability.py
Normal file
56
post_stats/readability.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Adapted from here: http://acdx.net/calculating-the-flesch-kincaid-level-in-python/
|
||||||
|
# See here for details: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test
|
||||||
|
|
||||||
|
from __future__ import division
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def mean(seq):
    """Return the arithmetic mean of *seq* (a sized, non-empty sequence)."""
    total = sum(seq)
    return total / len(seq)
|
||||||
|
|
||||||
|
|
||||||
|
def syllables(word):
    """Estimate the syllable count of *word* with a crude heuristic.

    Words of three letters or fewer count as one syllable.  Otherwise a
    trailing "es"/"ed"/silent "e" (but not "le") is dropped and each run
    of vowels (including "y") counts as one syllable.
    """
    if len(word) <= 3:
        return 1

    stripped = re.sub(r"(es|ed|(?<!l)e)$", "", word)
    vowel_runs = re.findall(r"[aeiouy]+", stripped)
    return len(vowel_runs)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text):
    """Collapse *text* into '. '-separated sentences of plain words.

    Removes every character that is not a letter, whitespace, or a
    sentence terminator; rewrites any run of terminators as '. '; and
    squeezes whitespace runs down to single spaces.
    """
    term = re.escape(".!?:;")
    cleaned = re.sub(r"[^%s\sA-Za-z]+" % term, "", text)
    cleaned = re.sub(r"\s*([%s]+\s*)+" % term, ". ", cleaned)
    return re.sub(r"\s+", " ", cleaned)
|
||||||
|
|
||||||
|
|
||||||
|
def text_stats(text, wc):
    """Return ``(sentences, words, syllables)`` counts for *text*.

    :param text: raw prose; run through :func:`normalize` first.
    :param wc: precomputed word count; when truthy it is used instead of
        counting the words here.
    :return: 3-tuple ``(sentence_count, word_count, syllable_count)``.
    """
    text = normalize(text)

    # Split into sentences, then each sentence into words, dropping
    # "sentences" of fewer than two words (splitting artifacts).
    # NOTE: the original used filter(), whose lazy iterator raises
    # TypeError on len() under Python 3 and would be exhausted by the
    # first sum() pass anyway — materialise it as a list.
    stcs = [s.split(" ") for s in text.split(". ")]
    stcs = [s for s in stcs if len(s) >= 2]

    if wc:
        words = wc
    else:
        words = sum(len(s) for s in stcs)

    sbls = sum(syllables(w) for s in stcs for w in s)

    return len(stcs), words, sbls
|
||||||
|
|
||||||
|
|
||||||
|
def flesch_index(stats):
    """Flesch reading-ease score from a ``(sentences, words, syllables)`` tuple.

    Returns 0 when there are no sentences or no words, avoiding a
    division by zero.  Higher scores indicate easier text.
    """
    stcs, words, sbls = stats
    if not (stcs and words):
        return 0
    return 206.835 - 1.015 * (words / stcs) - 84.6 * (sbls / words)
|
||||||
|
|
||||||
|
|
||||||
|
def flesch_kincaid_level(stats):
    """Flesch-Kincaid U.S. grade level from a ``(sentences, words, syllables)`` tuple.

    Returns 0 when there are no sentences or no words, avoiding a
    division by zero.
    """
    stcs, words, sbls = stats
    if not (stcs and words):
        return 0
    return 0.39 * (words / stcs) + 11.8 * (sbls / words) - 15.59
|
||||||
49
post_stats/readme.rst
Normal file
49
post_stats/readme.rst
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
Post Statistics
|
||||||
|
==================
|
||||||
|
|
||||||
|
A Pelican plugin to calculate various statistics about a post and store them in an article.stats dictionary:
|
||||||
|
|
||||||
|
- ``wc``: how many words
|
||||||
|
- ``read_mins``: how many minutes would it take to read this article, based on 250 wpm (http://en.wikipedia.org/wiki/Words_per_minute#Reading_and_comprehension)
|
||||||
|
- ``word_counts``: frequency count of all the words in the article; can be used for tag/word clouds
|
||||||
|
- ``fi``: Flesch-kincaid Index/ Reading Ease (see: http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests)
|
||||||
|
- ``fk``: Flesch-kincaid Grade Level
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
{
|
||||||
|
'wc': 2760,
|
||||||
|
'fi': '65.94',
|
||||||
|
'fk': '7.65',
|
||||||
|
'word_counts': Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, ...}),
|
||||||
|
'read_mins': 12
|
||||||
|
}
|
||||||
|
|
||||||
|
This allows you to output these values in your templates, like this, for example:
|
||||||
|
|
||||||
|
.. code-block:: html+jinja
|
||||||
|
|
||||||
|
<p title="~{{ article.stats['wc'] }} words">~{{ article.stats['read_mins'] }} min read</p>
|
||||||
|
<ul>
|
||||||
|
<li>Flesch-kincaid Index/ Reading Ease: {{ article.stats['fi'] }}</li>
|
||||||
|
<li>Flesch-kincaid Grade Level: {{ article.stats['fk'] }}</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
The ``word_counts`` variable is a python ``Counter`` dictionary and looks something like this, with each unique word and its frequency:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
Counter({u'to': 98, u'a': 90, u'the': 83, u'of': 50, u'karma': 50, .....
|
||||||
|
|
||||||
|
and can be used to create a tag/word cloud for a post.
|
||||||
|
|
||||||
|
Requirements
|
||||||
|
----------------
|
||||||
|
|
||||||
|
`post_stats` requires BeautifulSoup.
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ pip install beautifulsoup4
|
||||||
Reference in New Issue
Block a user