-
Notifications
You must be signed in to change notification settings - Fork 60
/
Copy pathcreate_search_index.py
executable file
·74 lines (63 loc) · 2.24 KB
/
create_search_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Script creates search index for Tipue Search 7.0
# Check http://www.tipue.com/search/help/ for more info
import json
import os
from bs4 import BeautifulSoup
# Recursively collects every .html file below a directory
def walker(path):
    """Return the paths of all ``.html`` files found beneath *path*.

    Args:
        path: Directory to search; subdirectories are visited as well.

    Returns:
        A list of strings, each formed by joining the containing directory
        (as reported by ``os.walk``) and the file name with a ``/``.
    """
    return [
        '/'.join((folder, name))
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith('.html')
    ]
# Takes html page and outputs json object
def parser(page):
    """Build one search-index record from an HTML page.

    Args:
        page: Path of the HTML file to read.

    Returns:
        A dict with the keys ``title``, ``url``, ``text`` and ``tags``
        (``tags`` is a single space-separated string), or ``None`` when
        extracting any of those values from the parsed page raises, e.g.
        because the page has no ``<title>``, ``<link href>``, ``<article>``
        or ``<p class="post-meta">`` element.

    Raises:
        Errors from opening, reading or parsing the file are not caught.
    """
    # Use a context manager so the handle is closed as soon as the markup has
    # been parsed instead of leaking one open file per page.
    # NOTE(review): the file is decoded with the locale's default encoding;
    # pass encoding='utf-8' here if the pages are known to be UTF-8.
    with open(page, 'r') as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    try:
        # NOTE(review): in this copy of the file both arguments of each
        # replace() call look identical, which makes the calls no-ops; the
        # originals may have been HTML entities or a non-breaking space that
        # got decoded somewhere along the way - confirm against the repository.
        title = soup.title.get_text(' ', strip=True).replace(' ', ' ').replace('^', '^')
        url = soup.link['href']
        text = soup.article.get_text(' ', strip=True).replace('^', '^')
        # A tag name is the last path segment of each link inside the
        # post-meta paragraph (empty when the href ends with '/').
        tags = [
            a['href'].split('/')[-1]
            for a in soup.find("p", class_="post-meta").find_all("a")
        ]
    except Exception:
        # Deliberate best effort: a page that does not have the expected
        # structure is left out of the index rather than aborting the run.
        return None

    return {
        'title': title,
        'url': url,
        'text': text,
        'tags': ' '.join(tags),
    }
# Serialises the collected records into the Tipue Search content file
def jsoner(nodes):
    """Write *nodes* as the JavaScript data file read by Tipue Search.

    Args:
        nodes: JSON-serialisable list of page records; it is stored under
            the ``pages`` key of the emitted object.

    The file content is ``var tipuesearch = <json>;`` and it is written to
    ``public/tipuesearch/tipuesearch_content.js`` relative to the current
    working directory.
    """
    payload = json.dumps({'pages': nodes})
    # This is hardcoded http://www.tipue.com/search/help/?d=2
    with open('public/tipuesearch/tipuesearch_content.js', 'w') as target:
        target.write('var tipuesearch = {0};'.format(payload))
# Sitemap generation
def sitemaper(nodes):
    """Write an XML sitemap containing the URL of every record in *nodes*.

    Args:
        nodes: Iterable of page records; only each record's ``url`` key is
            read, and it is expected to be a string.

    The sitemap is written to ``public/search/sitemap.xml`` relative to the
    current working directory. The directory is not created here.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    header = '''<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'''
    entry = '<url><loc>{0}</loc><changefreq>daily</changefreq><priority>0.5</priority></url>\n'

    # Escape &, < and > so that a URL with a query string (or any other
    # markup-significant character) cannot produce malformed XML.
    entries = ''.join(entry.format(escape(n['url'])) for n in nodes)

    # The XML declaration above promises UTF-8, so write the file as UTF-8
    # instead of whatever the locale default happens to be.
    with open('public/search/sitemap.xml', 'w', encoding='utf-8') as f:
        f.write(header + entries + '\n</urlset>')
# Entry point: the presence of ./public/tipuesearch is used as the sign that
# the script was started from the Hugo site root.
if not os.path.exists('./public/tipuesearch'):
    print('Error: place this script in hugo site root')
else:
    pages = walker('.')
    # parser() returns None for pages it cannot turn into a record; keep
    # only the pages that produced one.
    nodes = [node for node in map(parser, pages) if node]
    jsoner(nodes)
    sitemaper(nodes)