# -*- coding: utf-8 -*-
"""
Tipue Search
============
A Pelican plugin that serializes the generated HTML to the JSON
format consumed by the Tipue Search jQuery plugin.
Copyright (c) Talha Mansoor
"""
from __future__ import unicode_literals
import os.path
import json
import re
from bs4 import BeautifulSoup
from codecs import open
try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin
from pelican import signals
class Tipue_Search_JSON_Generator(object):
def __init__(self, context, settings, path, theme, output_path, *null):
self.output_path = output_path
self.context = context
self.siteurl = settings.get('SITEURL')
self.relative_urls = settings.get('RELATIVE_URLS')
self.tpages = settings.get('TEMPLATE_PAGES')
self.tstatic = settings.get('THEME_STATIC_DIR')
self.json_nodes = []
def normalize(self, s):
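        # Build a crude URL slug from an author name: lower-case the string,
        # drop dots and common accented vowels, then hyphenate the spaces,
        # e.g. normalize("José Álvarez") -> "jose-alvarez".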
replacements = (
("á", "a"),
("é", "e"),
("í", "i"),
("ó", "o"),
("ú", "u"),
(".", ""),
)
s = s.lower()
        for a, b in replacements:
            s = s.replace(a, b)
        # collapse each run of whitespace into a single hyphen to build the slug
        s = re.sub(r"\s+", "-", s)
return s
def create_json_node(self, article):
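        # Serialize one published article into a "video" search node. The
        # article is expected to carry video-oriented metadata; the field
        # names below mirror the getattr() calls in this method, while the
        # values are purely illustrative:
        #
        #     :videoid: dQw4w9WgXcQ
        #     :image: thumbnail.jpg
        #     :og_video: https://example.com/videos/clip.mp4
        #     :time: 12:34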
if getattr(article, 'status', 'published') != 'published':
return
        soup_title = BeautifulSoup(
            article.title.replace('&nbsp;', ' '), 'html.parser')
        video_title = soup_title.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace('^', '&#94;')
        # description: skip the leading <figure> block and keep only the
        # paragraphs that follow it; fall back to the whole article body
        # when there is no figure.
        art_desc = BeautifulSoup(article.content, 'html.parser')
        try:
            art_desc = art_desc.find('figure').find_all_next('p')
            art_desc_html = ''.join(map(str, art_desc))
            art_desc = BeautifulSoup(art_desc_html, 'html.parser')
            video_desc_html = art_desc_html.replace('\n', ' ')
        except AttributeError:
            video_desc_html = ''.join(
                map(str, art_desc)).replace('\n', ' ')
        video_desc_text = art_desc.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace(
            '¶', ' ').replace('^', '&#94;')
video_desc_text = ' '.join(video_desc_text.split())
# base url
if self.relative_urls:
base_url = '.'
else:
base_url = self.siteurl
# videoid
video_id = str(article.videoid) if getattr(
article, 'videoid', 'None') != 'None' else ''
# thumbnail
video_image = article.image if getattr(
article, 'image', 'None') != 'None' else ''
url_image = "%s/%s/../wp-content/uploads/article/thumbnail/%s" % (
base_url, self.tstatic, video_image
)
# publish
video_publish = article.date.isoformat() if getattr(
article, 'date', 'None') != 'None' else ''
# publish_text
video_publish_text = article.date.strftime("%a, %d %B, %Y") if getattr(
article, 'date', 'None') != 'None' else ''
# author
video_author = str(article.author) if getattr(
article, 'author', 'None') != 'None' else ''
# author url
video_author_url = "%s/author/%s/" % (
base_url, self.normalize(video_author)
)
# time
video_time = article.time if getattr(
article, 'time', 'None') != 'None' else ''
video_url = '.'
if article.url:
video_url = article.url if self.relative_urls else (
self.siteurl + '/' + article.url)
video_src = article.og_video if getattr(
article, 'og_video', 'None') != 'None' else ''
# category
video_category = article.category.name if getattr(
article, 'category', 'None') != 'None' else ''
# tags
        data_tags = ['%s' % tag for tag in getattr(article, 'tags', [])]
video_tags = dict((num, tag) for num, tag in enumerate(data_tags))
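        # enumerate() yields integer keys, which json.dump() coerces to
        # strings, so keywords serialize as {"0": "tag-one", "1": "tag-two"}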
node = {
'videoId': video_id,
'title': video_title,
'description': video_desc_text,
'descriptionHtml': video_desc_html,
'videoThumbnail': url_image,
'formatStreams': {
'url': video_src,
},
'author': video_author,
'authorUrl': video_author_url,
'published': video_publish,
'publishedText': video_publish_text,
'time': video_time,
'category': video_category,
'keywords': video_tags,
'url': video_url
}
self.json_nodes.append(node)
def create_tpage_node(self, srclink):
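        # TEMPLATE_PAGES maps template source paths to output paths relative
        # to the output directory, so self.tpages[srclink] is the generated
        # HTML file that gets indexed here.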
        with open(os.path.join(self.output_path,
                               self.tpages[srclink]),
                  encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')
video_title = soup.title.string if soup.title is not None else ''
video_text = soup.get_text()
        # template pages have no category metadata; a sensible default
        # could be set here instead of an empty string
        video_category = ''
video_url = urljoin(self.siteurl, self.tpages[srclink])
node = {'title': video_title,
'text': video_text,
'tags': video_category,
'url': video_url}
self.json_nodes.append(node)
def generate_output(self, writer):
path = os.path.join(self.output_path, 'tipuesearch_content.json')
        # copy the list so that appending translations does not extend the
        # list held in self.context while it is being iterated
        articles = list(self.context['articles'])
        for article in self.context['articles']:
            articles += article.translations
for srclink in self.tpages:
self.create_tpage_node(srclink)
for article in articles:
self.create_json_node(article)
root_node = {'videos': self.json_nodes}
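        # The written file mixes article and template-page nodes; its shape
        # is roughly (values illustrative only):
        #     {"videos": [{"videoId": "...", "title": "...", ...},
        #                 {"title": "...", "text": "...", "tags": "", "url": "..."}]}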
with open(path, 'w', encoding='utf-8') as fd:
json.dump(root_node, fd, separators=(',', ':'), ensure_ascii=False)
def get_generators(generators):
return Tipue_Search_JSON_Generator
def register():
signals.get_generators.connect(get_generators)