| author | srv <enmanuel.saravia.externo@pandero.com.pe> | 2025-04-28 17:11:28 -0500 |
|---|---|---|
| committer | srv <enmanuel.saravia.externo@pandero.com.pe> | 2025-04-28 17:11:28 -0500 |
| commit | f35a7b0e70032de2feec9f3bda09da44cf0e1073 (patch) | |
| tree | 1e0e09581dd3707d0ceb93346452dd14451a8423 | /plugins/tipue-search/tipue_search.py |
first commit
Diffstat (limited to 'plugins/tipue-search/tipue_search.py')
-rw-r--r--  plugins/tipue-search/tipue_search.py  212
1 file changed, 212 insertions, 0 deletions
diff --git a/plugins/tipue-search/tipue_search.py b/plugins/tipue-search/tipue_search.py
new file mode 100644
index 0000000..19ef68f
--- /dev/null
+++ b/plugins/tipue-search/tipue_search.py
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+"""
+Tipue Search
+============
+
+A Pelican plugin to serialize generated HTML to JSON
+that can be used by the jQuery plugin Tipue Search.
+
+Copyright (c) Talha Mansoor
+"""
+
+from __future__ import unicode_literals
+
+import os.path
+import json
+import re
+from bs4 import BeautifulSoup
+from codecs import open
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin
+
+from pelican import signals
+
+
+class Tipue_Search_JSON_Generator(object):
+
+    def __init__(self, context, settings, path, theme, output_path, *null):
+        self.output_path = output_path
+        self.context = context
+        self.siteurl = settings.get('SITEURL')
+        self.relative_urls = settings.get('RELATIVE_URLS')
+        self.tpages = settings.get('TEMPLATE_PAGES')
+        self.tstatic = settings.get('THEME_STATIC_DIR')
+        self.json_nodes = []
+
+    def normalize(self, s):
+        # Strip Spanish accents and dots, then hyphenate the spaces
+        # between words to build a URL slug.
+        replacements = (
+            ("á", "a"),
+            ("é", "e"),
+            ("í", "i"),
+            ("ó", "o"),
+            ("ú", "u"),
+            (".", ""),
+        )
+        s = s.lower()
+        for a, b in replacements:
+            s = s.replace(a, b)
+
+        s = re.sub(r"([a-z]) ([a-z])", r"\1-\2", s, 0,
+                   re.IGNORECASE | re.DOTALL)
+        return s
+
+    def create_json_node(self, article):
+        if getattr(article, 'status', 'published') != 'published':
+            return
+
+        soup_title = BeautifulSoup(
+            article.title.replace('&nbsp;', ' '), 'html.parser')
+        video_title = soup_title.get_text(' ', strip=True).replace(
+            '“', '"').replace(
+            '”', '"').replace(
+            '’', "'").replace('^', '&#94;')
+
+        # description
+        art_desc = BeautifulSoup(article.content, 'html.parser')
+
+        # Ignore a leading <h1> inside <figure>: when the content opens
+        # with a figure, keep only the <p> elements that follow it.
+        try:
+            art_desc = art_desc.find('figure').find_all_next('p')
+            art_desc_html = ''.join(map(str, art_desc))
+            art_desc = BeautifulSoup(art_desc_html, 'html.parser')
+            video_desc_html = art_desc_html.replace('\n', ' ')
+        except AttributeError:
+            # No <figure> in the content; fall back to the whole body.
+            video_desc_html = ''.join(map(str, art_desc)).replace('\n', ' ')
+
+        video_desc_text = art_desc.get_text(' ', strip=True).replace(
+            '“', '"').replace(
+            '”', '"').replace(
+            '’', "'").replace(
+            '¶', ' ').replace('^', '&#94;')
+
+        video_desc_text = ' '.join(video_desc_text.split())
+
+        # base url
+        if self.relative_urls:
+            base_url = '.'
+        else:
+            base_url = self.siteurl
+
+        # videoid
+        video_id = str(article.videoid) if getattr(
+            article, 'videoid', 'None') != 'None' else ''
+
+        # thumbnail
+        video_image = article.image if getattr(
+            article, 'image', 'None') != 'None' else ''
+
+        url_image = "%s/%s/../wp-content/uploads/article/thumbnail/%s" % (
+            base_url, self.tstatic, video_image
+        )
+
+        # publish
+        video_publish = article.date.isoformat() if getattr(
+            article, 'date', 'None') != 'None' else ''
+
+        # publish_text
+        video_publish_text = article.date.strftime("%a, %d %B, %Y") if getattr(
+            article, 'date', 'None') != 'None' else ''
+
+        # author
+        video_author = str(article.author) if getattr(
+            article, 'author', 'None') != 'None' else ''
+
+        # author url
+        video_author_url = "%s/author/%s/" % (
+            base_url, self.normalize(video_author)
+        )
+
+        # time
+        video_time = article.time if getattr(
+            article, 'time', 'None') != 'None' else ''
+
+        video_url = '.'
+        if article.url:
+            video_url = article.url if self.relative_urls else (
+                self.siteurl + '/' + article.url)
+
+        video_src = article.og_video if getattr(
+            article, 'og_video', 'None') != 'None' else ''
+
+        # category
+        video_category = article.category.name if getattr(
+            article, 'category', 'None') != 'None' else ''
+
+        # tags (articles without tags have no 'tags' attribute)
+        data_tags = ['%s' % tag for tag in getattr(article, 'tags', [])]
+        video_tags = dict((num, tag) for num, tag in enumerate(data_tags))
+
+        node = {
+            'videoId': video_id,
+            'title': video_title,
+            'description': video_desc_text,
+            'descriptionHtml': video_desc_html,
+            'videoThumbnail': url_image,
+            'formatStreams': {
+                'url': video_src,
+            },
+            'author': video_author,
+            'authorUrl': video_author_url,
+            'published': video_publish,
+            'publishedText': video_publish_text,
+            'time': video_time,
+            'category': video_category,
+            'keywords': video_tags,
+            'url': video_url
+        }
+
+        self.json_nodes.append(node)
+
+    def create_tpage_node(self, srclink):
+        with open(os.path.join(self.output_path, self.tpages[srclink]),
+                  encoding='utf-8') as srcfile:
+            soup = BeautifulSoup(srcfile, 'html.parser')
+        video_title = soup.title.string if soup.title is not None else ''
+        video_text = soup.get_text()
+
+        # Template pages carry no category; use an empty default.
+        video_category = ''
+        video_url = urljoin(self.siteurl, self.tpages[srclink])
+
+        node = {'title': video_title,
+                'text': video_text,
+                'tags': video_category,
+                'url': video_url}
+
+        self.json_nodes.append(node)
+
+    def generate_output(self, writer):
+        path = os.path.join(self.output_path, 'tipuesearch_content.json')
+
+        # Copy the context list so that appending translations below does
+        # not mutate the list while it is being iterated.
+        articles = list(self.context['articles'])
+
+        for article in self.context['articles']:
+            articles += article.translations
+
+        for srclink in self.tpages:
+            self.create_tpage_node(srclink)
+
+        for article in articles:
+            self.create_json_node(article)
+
+        root_node = {'videos': self.json_nodes}
+
+        with open(path, 'w', encoding='utf-8') as fd:
+            json.dump(root_node, fd, separators=(',', ':'), ensure_ascii=False)
+
+
+def get_generators(generators):
+    return Tipue_Search_JSON_Generator
+
+
+def register():
+    signals.get_generators.connect(get_generators)
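To try the generator, the plugin directory has to be on Pelican's plugin path and the module enabled by name. The fragment below is a minimal, hypothetical `pelicanconf.py` sketch, not part of this commit; `PLUGIN_PATHS`, `PLUGINS`, `SITEURL`, `RELATIVE_URLS`, `THEME_STATIC_DIR`, and `TEMPLATE_PAGES` are the standard Pelican settings the generator reads in `__init__`, and the concrete values are placeholders.

```python
# pelicanconf.py -- hypothetical values, adjust to your site layout
PLUGIN_PATHS = ['plugins/tipue-search']  # directory holding tipue_search.py
PLUGINS = ['tipue_search']               # module name, imported by Pelican

SITEURL = 'https://example.com'  # base for absolute node URLs
RELATIVE_URLS = False            # True makes node URLs start with '.'
THEME_STATIC_DIR = 'theme'       # spliced into the thumbnail URL
TEMPLATE_PAGES = {               # each rendered page becomes a text node
    'pages/about.html': 'about.html',
}
```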
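For reference, each article node written to `tipuesearch_content.json` follows the `node` dict built in `create_json_node`; the sample below only illustrates the shape, with made-up values rather than data from this repository:

```json
{"videos": [{
  "videoId": "42",
  "title": "Sample title",
  "description": "Plain-text description",
  "descriptionHtml": "<p>Plain-text description</p>",
  "videoThumbnail": "https://example.com/theme/../wp-content/uploads/article/thumbnail/img.jpg",
  "formatStreams": {"url": "https://example.com/videos/sample.mp4"},
  "author": "Jane Doe",
  "authorUrl": "https://example.com/author/jane-doe/",
  "published": "2025-04-28T17:11:28-05:00",
  "publishedText": "Mon, 28 April, 2025",
  "time": "12:34",
  "category": "misc",
  "keywords": {"0": "pelican", "1": "search"},
  "url": "https://example.com/sample-title.html"
}]}
```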