-
Notifications
You must be signed in to change notification settings - Fork 0
/
indexer.php
119 lines (101 loc) · 3.05 KB
/
indexer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
<?php
/**
* Created by PhpStorm.
* User: gabrielgagno
* Date: 1/19/16
* Time: 6:07 PM
*/
namespace Parser;
include('MetaParser.php');
use Parser\MetaParser;
# Default connection settings for the MySQL (RDS) database.
# Every value is expected to be overridden by the KEY=VALUE pairs read
# from the 'rds_location' file further down in this script.
$rds_settings = [
    "host" => "",
    "db" => "",
    "user" => "",
    "password" => "",
];
# Default connection settings for the Elasticsearch endpoint, overridden by
# the KEY=VALUE pairs read from the 'es_location' file.
# "port" is declared here because the bulk URL is built from
# $es_settings['port']; without a default, a config file that omits the port
# triggers an undefined-key warning.
$es_settings = [
    "host" => "",
    "port" => "",
    "index" => "",
];
# save rds settings
# Reads KEY=VALUE lines from the 'rds_location' file into $rds_settings,
# overriding the defaults declared above.
$rds_file = fopen('rds_location', 'r');
echo "UPDATING RDS CREDENTIALS\n";
if(!$rds_file) {
    # double quotes so that \n is emitted as a real newline
    die("ERROR: RDS CONFIG FILE NOT FOUND\n");
}
# Loop on the fgets() result itself. When the file ends with a newline,
# feof() is still false after the last line has been read, so a
# while(!feof()) loop runs once more with fgets() returning false and
# stores a bogus entry.
while(($row = fgets($rds_file)) !== false) {
    # strip the line terminator (handles both LF and CRLF files)
    $row = rtrim($row, "\r\n");
    # split on the first '=' only, so a value (e.g. a password) may itself
    # contain '=' without being truncated
    $pair = explode('=', $row, 2);
    if($pair[0] === '') {
        # blank line, or a line with no key in front of the '='
        continue;
    }
    $rds_settings[$pair[0]] = isset($pair[1]) ? $pair[1] : '';
}
fclose($rds_file);
echo "SUCCESSFULLY CONFIGURED RDS SETTINGS\n";
#save es settings
# Reads KEY=VALUE lines from the 'es_location' file into $es_settings,
# overriding the defaults declared above.
echo "UPDATING ES CREDENTIALS\n";
$es_file = fopen('es_location', 'r');
if(!$es_file) {
    # double quotes so that \n is emitted as a real newline
    die("ERROR: ES CONFIG FILE NOT FOUND\n");
}
# Loop on the fgets() result itself. When the file ends with a newline,
# feof() is still false after the last line has been read, so a
# while(!feof()) loop runs once more with fgets() returning false and
# stores a bogus entry.
while(($row = fgets($es_file)) !== false) {
    # strip the line terminator (handles both LF and CRLF files)
    $row = rtrim($row, "\r\n");
    # split on the first '=' only, so a value may itself contain '='
    $pair = explode('=', $row, 2);
    if($pair[0] === '') {
        # blank line, or a line with no key in front of the '='
        continue;
    }
    $es_settings[$pair[0]] = isset($pair[1]) ? $pair[1] : '';
}
# release the handle, mirroring what is done for the RDS config file
fclose($es_file);
echo "SUCCESSFULLY CONFIGURED ES SETTINGS\n";
echo "CONNECTING TO MYSQL DATABASE...\n";
# Open the MySQL connection with the credentials collected in $rds_settings.
# Depending on the mysqli report mode, a failed connect either throws
# mysqli_sql_exception or returns false, so both paths are handled.
try {
    $conn = mysqli_connect(
        $rds_settings['host'],
        $rds_settings['user'],
        $rds_settings['password'],
        $rds_settings['db']
    );
} catch(\mysqli_sql_exception $e) {
    die('Connection failed : ' . $e->getMessage());
}
if(!$conn) {
    # There is no connection object on this path, so mysqli_error($conn)
    # cannot be called; mysqli_connect_error() describes the failed connect.
    die('Connection failed : ' . mysqli_connect_error());
}
echo "SUCCESSFULLY CONNECTED TO MYSQL DATABASE\n";
# autocommit() takes a boolean; the string "true" only worked through
# truthiness coercion (the string "false" would have enabled it as well)
$conn->autocommit(true);
echo "AUTOCOMMIT SET TO TRUE\n";
# Fetch every webpage row with status = 2 and push the rows to the
# Elasticsearch _bulk endpoint in batches of 200 documents.
$query = "select id, title, baseUrl, text, content from webpage where status = 2";
$results = $conn->query($query);
if(!$results) {
    # query() returns false on failure; stop before dereferencing num_rows
    die('Query failed : ' . mysqli_error($conn) . "\n");
}
#bulk index using curl
# The port may be missing from the ES config; leave the ":port" part out of
# the URL in that case instead of reading an undefined array key.
$esAuthority = $es_settings['host'];
if(!empty($es_settings['port'])) {
    $esAuthority = $esAuthority.":".$es_settings['port'];
}
$curlUrl = "http://".$esAuthority."/".$es_settings['index']."/webpage/_bulk";
echo "CURL URL: ".$curlUrl."\n";
$ch = curl_init();
# These options are identical for every batch, so they are set once here;
# only the request body changes inside the loop.
curl_setopt_array($ch, array(
    CURLOPT_CUSTOMREQUEST => 'POST',
    CURLOPT_URL => $curlUrl,
    CURLOPT_RETURNTRANSFER => 1,
    # the bulk body is newline-delimited JSON
    CURLOPT_HTTPHEADER => array('Content-Type: application/x-ndjson')
));
# initialize curl payload
$payload = "";
$rowNum = 1;
$rowCount = $results->num_rows;
# for all results of query
while($row = mysqli_fetch_assoc($results)) {
    echo "DOCUMENT NUMBER: ".$rowNum."\n";
    $metas = MetaParser::parseMetaTagsFromHtmlString($row['content'], ['description', 'keywords']);
    # NOTE(review): utf8_encode() converts ISO-8859-1 to UTF-8 and is
    # deprecated as of PHP 8.2; this presumes the database returns Latin-1
    # text - confirm the connection charset before replacing it.
    $payloadHalf = json_encode(array(
        "id" => utf8_encode($row['id']),
        "title" => utf8_encode($row['title']),
        "url" => utf8_encode($row['baseUrl']),
        "content" => utf8_encode($row['text']),
        "desc" => empty($metas['description'])?null:utf8_encode($metas['description']),
        "keywords" => empty($metas['keywords'])?null:utf8_encode($metas['keywords']),
        "subdomain" => MetaParser::getSubdomain($row['baseUrl'])
    ), JSON_UNESCAPED_SLASHES);
    if($payloadHalf === false) {
        # An unencodable document would otherwise add an action line that is
        # followed by an empty source line to the bulk body.
        echo "SKIPPING DOCUMENT ".$row['id'].": ".json_last_error_msg()."\n";
    } else {
        # each document is an action line followed by its source line
        $payload = $payload."{\"create\":{}}\n".$payloadHalf."\n";
    }
    # flush every 200 rows and once more on the final row; nothing is sent
    # when every document of the batch was skipped
    if(($rowNum%200 == 0 || $rowNum == $rowCount) && $payload !== "") {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        echo "\nindexing job: starting...\n";
        #execute bulk index for the accumulated documents
        $response = curl_exec($ch);
        if($response === false) {
            # transport-level failure (DNS, connect, timeout, ...): report it
            # and carry on with the next batch
            echo "indexing job: FAILED - ".curl_error($ch)."\n";
        } else {
            echo $response;
            echo "indexing job: done.\n";
        }
        $payload = "";
    }
    $rowNum++;
}
curl_close($ch);
# close the connection
$conn->close();
echo "DONE\n";