BeautifulSoup supports multiple HTML parsers. Some of those parsers try to make the HTML valid by adding or removing tags [1]. This can leave unwanted html, head and body tags in the final document. Explicitly setting the parser to 'html.parser' avoids this behaviour. [1] http://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers
35 lines
984 B
Python
35 lines
984 B
Python
"""
|
|
Extract Table of Content
|
|
========================
|
|
|
|
This plugin allows you to extract the table of contents (ToC) from article.content
|
|
and place it in its own article.toc variable.
|
|
"""
|
|
|
|
from os import path
|
|
from bs4 import BeautifulSoup
|
|
from pelican import signals, readers, contents
|
|
|
|
|
|
def extract_toc(content):
    """Move the generated table of contents out of the rendered body.

    The source file's extension selects which ToC container to look for:
    ``<div class="toc">`` for Markdown files, or
    ``<div class="contents topic">`` for reST files. When that element is
    found it is removed from ``content._content`` and its HTML is stored on
    ``content.toc``.

    Static content, files with any other extension, and documents without
    a ToC element are left untouched (``content.toc`` is not set).

    :param content: the object passed by the ``content_object_init``
        signal; it is modified in place.
    """
    if isinstance(content, contents.Static):
        return

    extension = path.splitext(content.source_path)[1][1:]

    # Decide which ToC container applies before doing any parsing.
    if extension in readers.MarkdownReader.file_extensions:
        toc_class = 'toc'
    elif extension in readers.RstReader.file_extensions:
        toc_class = 'contents topic'
    else:
        # Neither reader handles this file, so there is nothing to extract
        # and no reason to pay for an HTML parse.
        return

    # 'html.parser' is named explicitly because other parsers may add
    # html/head/body tags to the fragment while making it valid.
    soup = BeautifulSoup(content._content, 'html.parser')
    toc = soup.find('div', class_=toc_class)
    if not toc:
        return

    # Detach the ToC from the tree, then serialise both pieces separately.
    toc.extract()
    content._content = soup.decode()
    content.toc = toc.decode()
|
|
|
|
|
|
def register():
    """Plugin entry point: hook ``extract_toc`` up to Pelican.

    Connects ``extract_toc`` to the ``content_object_init`` signal.
    """
    content_initialised = signals.content_object_init
    content_initialised.connect(extract_toc)
|