# -*- coding: utf-8 -*-
"""
Tipue Search
============

A Pelican plugin to serialize generated HTML to JSON
that can be used by jQuery plugin - Tipue Search.

Copyright (c) Talha Mansoor
"""

from __future__ import unicode_literals

import os.path
import json
import re
from bs4 import BeautifulSoup
from codecs import open

try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

from pelican import signals


class Tipue_Search_JSON_Generator(object):
    """Pelican generator that collects article/page metadata into a single
    ``tipuesearch_content.json`` file under the output directory."""

    def __init__(self, context, settings, path, theme, output_path, *null):
        self.output_path = output_path
        self.context = context
        self.siteurl = settings.get('SITEURL')
        self.relative_urls = settings.get('RELATIVE_URLS')
        self.tpages = settings.get('TEMPLATE_PAGES')
        self.tstatic = settings.get('THEME_STATIC_DIR')
        # accumulated JSON nodes, one per article / template page
        self.json_nodes = []

    def normalize(self, s):
        """Lowercase *s*, strip Spanish accents and dots, and hyphenate the
        spaces between words (used to build author URL slugs)."""
        replacements = (
            ("á", "a"),
            ("é", "e"),
            ("í", "i"),
            ("ó", "o"),
            ("ú", "u"),
            (".", ""),
        )
        s = s.lower()
        for accented, plain in replacements:
            # s is already lowercase, so a single replace per pair suffices
            s = s.replace(accented, plain)
        # turn "word word" into "word-word"; pass count/flags as keywords
        # (positional count/flags is deprecated in modern Python)
        s = re.sub(r"([a-z]) ([a-z])", r"\1-\2", s,
                   flags=re.IGNORECASE | re.DOTALL)
        return s

    def create_json_node(self, article):
        """Build the JSON node for one published *article* and append it to
        ``self.json_nodes``. Drafts (status != 'published') are skipped."""
        if getattr(article, 'status', 'published') != 'published':
            return

        # title: strip markup, normalize NBSP and smart quotes
        soup_title = BeautifulSoup(
            article.title.replace(' ', ' '), 'html.parser')
        video_title = soup_title.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace('^', '^')

        # description
        art_desc = BeautifulSoup(article.content, 'html.parser')

        # fix: ignore the leading <figure> inside the description — keep only
        # the paragraphs that follow it; if there is no <figure>,
        # find() returns None and raises AttributeError, so fall back to the
        # whole content (narrowed from a bare except:, which also swallowed
        # KeyboardInterrupt/SystemExit)
        try:
            art_desc = art_desc.find('figure').find_all_next('p')
            art_desc_html = ''.join(map(str, art_desc))
            art_desc = BeautifulSoup(art_desc_html, 'html.parser')
            video_desc_html = art_desc_html.replace('\n', ' ')
        except AttributeError:
            video_desc_html = ''.join(
                map(str, art_desc)).replace('\n', ' ')

        video_desc_text = art_desc.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace(
            '¶', ' ').replace('^', '^')
        # collapse all runs of whitespace into single spaces
        video_desc_text = ' '.join(video_desc_text.split())

        # base url
        if self.relative_urls:
            base_url = '.'
        else:
            base_url = self.siteurl

        # videoid — the 'None' sentinel pattern below treats both a missing
        # attribute and an explicit 'None' value as absent
        video_id = str(article.videoid) if getattr(
            article, 'videoid', 'None') != 'None' else ''

        # thumbnail
        video_image = article.image if getattr(
            article, 'image', 'None') != 'None' else ''
        url_image = "%s/%s/../wp-content/uploads/article/thumbnail/%s" % (
            base_url, self.tstatic, video_image
        )

        # publish
        video_publish = article.date.isoformat() if getattr(
            article, 'date', 'None') != 'None' else ''

        # publish_text
        video_publish_text = article.date.strftime("%a, %d %B, %Y") if getattr(
            article, 'date', 'None') != 'None' else ''

        # author
        video_author = str(article.author) if getattr(
            article, 'author', 'None') != 'None' else ''

        # author url
        video_author_url = "%s/author/%s/" % (
            base_url, self.normalize(video_author)
        )

        # time
        video_time = article.time if getattr(
            article, 'time', 'None') != 'None' else ''

        video_url = '.'
        if article.url:
            video_url = article.url if self.relative_urls else (
                self.siteurl + '/' + article.url)

        video_src = article.og_video if getattr(
            article, 'og_video', 'None') != 'None' else ''

        # category
        video_category = article.category.name if getattr(
            article, 'category', 'None') != 'None' else ''

        # tags — guarded like the other optional attributes so an article
        # without tags doesn't crash the whole generation
        data_tags = ['%s' % (tag) for tag in getattr(article, 'tags', [])]
        video_tags = dict(enumerate(data_tags))

        node = {
            'videoId': video_id,
            'title': video_title,
            'description': video_desc_text,
            'descriptionHtml': video_desc_html,
            'videoThumbnail': url_image,
            'formatStreams': {
                'url': video_src,
            },
            'author': video_author,
            'authorUrl': video_author_url,
            'published': video_publish,
            'publishedText': video_publish_text,
            'time': video_time,
            'category': video_category,
            'keywords': video_tags,
            'url': video_url
        }

        self.json_nodes.append(node)

    def create_tpage_node(self, srclink):
        """Build the JSON node for one TEMPLATE_PAGES entry (*srclink* is the
        key into ``self.tpages``) and append it to ``self.json_nodes``."""
        # use a context manager so the generated HTML file is always closed
        with open(os.path.join(self.output_path, self.tpages[srclink]),
                  encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')
        video_title = soup.title.string if soup.title is not None else ''
        video_text = soup.get_text()

        # Should set default category
        video_category = ''
        video_url = urljoin(self.siteurl, self.tpages[srclink])

        node = {'title': video_title,
                'text': video_text,
                'tags': video_category,
                'url': video_url}

        self.json_nodes.append(node)

    def generate_output(self, writer):
        """Pelican hook: write all collected nodes to
        ``tipuesearch_content.json`` in the output directory."""
        path = os.path.join(self.output_path, 'tipuesearch_content.json')

        # copy the list before extending it: the original aliased
        # self.context['articles'] and extended it in place while iterating
        # over the very same list, which can re-visit items indefinitely
        articles = list(self.context['articles'])
        for article in self.context['articles']:
            articles += article.translations

        for srclink in self.tpages:
            self.create_tpage_node(srclink)

        for article in articles:
            self.create_json_node(article)

        root_node = {'videos': self.json_nodes}

        with open(path, 'w', encoding='utf-8') as fd:
            json.dump(root_node, fd, separators=(',', ':'),
                      ensure_ascii=False)


def get_generators(generators):
    """Signal handler: hand Pelican our generator class."""
    return Tipue_Search_JSON_Generator


def register():
    """Pelican plugin entry point."""
    signals.get_generators.connect(get_generators)