From 8d0e643637d349cc5407b3b12b796f0331398444 Mon Sep 17 00:00:00 2001 From: bas smit Date: Fri, 24 May 2013 11:12:51 +0200 Subject: [PATCH] Explicitly set the html parser to make sure no extra tags get added. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BeautifulSoup supports multiple HTML parsers. Some of those parsers try to make the HTML valid by adding/removing tags[1]. This can lead to superfluous <html>, <head> and <body> tags in the final document. By explicitly setting the parser to 'html.parser', this behaviour can be avoided. [1] http://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers --- extract_toc/extract_toc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_toc/extract_toc.py b/extract_toc/extract_toc.py index 479970b..a3a7e00 100644 --- a/extract_toc/extract_toc.py +++ b/extract_toc/extract_toc.py @@ -14,7 +14,7 @@ from pelican import signals, readers, contents def extract_toc(content): if isinstance(content, contents.Static): return - soup = BeautifulSoup(content._content) + soup = BeautifulSoup(content._content,'html.parser') filename = content.source_path extension = path.splitext(filename)[1][1:] toc = ''