forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDates.h
830 lines (667 loc) · 25.1 KB
/
Dates.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
#ifndef _DATES_H_
#define _DATES_H_
#include "gb-include.h"
#include "XmlNode.h" // nodeid_t
#include "Bits.h"
#include "HashTableX.h"
// Calendar helpers declared at file scope rather than as Dates members.
// The original note here reads "now address uses these", i.e. code outside
// the Dates class (presumably the address code -- confirm) calls them too.

// NOTE(review): by its name and return type this yields the time_t at which
// month "m" of year "y" starts; whether "m" is 0- or 1-based is not visible
// in this header -- confirm in Dates.cpp.
time_t getYearMonthStart ( int32_t y , int32_t m ) ;
// Returns a time_t for the count'th occurrence of a day-of-week in month "m"
// of year "y". Per the note kept next to the old member declaration in
// class Dates: for the 4th Monday of May 2009 use a dowArg of 2 (Monday) and
// a count of 4.
time_t getDOWStart ( int32_t y , int32_t m, int32_t dowArg, int32_t count ) ;
// Takes the year as well as the month, presumably to handle February in
// leap years (the original trailing comment is just "leap year?").
int32_t getNumDaysInMonth ( int32_t month , int32_t year ) ; // leap year?
// dow is 0 to 6
// NOTE(review): this is a different numbering from the dowArg example above
// (2 == Monday) and from Date::m_dow (1 through 7) -- confirm at call sites.
char *getDOWName ( int32_t dow ) ;
// month is 0 to 11
char *getMonthName ( int32_t month ) ;
// . type of Date::m_flags and Date::m_flags5
// . 64 bits wide; in the DF_* table below every bit except 0x8 (the retired
//   DF_NOYEAR) is assigned, which is why a second word (DF5_*) exists
typedef int64_t dateflags_t;
// . values for Date::m_flags
// . these are of type dateflags_t
// . EVERY value carries the LL suffix so the constant is already 64 bits
//   wide before any operator is applied to it. Without the suffix the low
//   32 flags are int/unsigned int, and in particular ~DF_ASSUMED_YEAR
//   (0x80000000, an unsigned int) evaluates to 0x7FFFFFFF, so
//   "flags &= ~DF_ASSUMED_YEAR" would also wipe out bits 32-63.
// . pubdate flags
// . is it a clock?
#define DF_CLOCK 0x0000000001LL
#define DF_NOTCLOCK 0x0000000002LL
// . is it > 25 hrs in the future
#define DF_FUTURE 0x0000000004LL
// this means we do not have a year, like "9/16" (year taken from spider date)
//#define DF_NOYEAR 0x0000000008
// where we got the date from
#define DF_FROM_RSSINLINK 0x0000000010LL
// is this a "modified date"? that means we could not find a valid pub
// date on the page or from rss info, but it changed significantly since the
// last time we spidered it, so we make a guess at the pub date.
#define DF_ESTIMATED 0x0000000020LL
// where we got the date from
#define DF_FROM_BODY 0x0000000040LL
#define DF_FROM_URL 0x0000000080LL
#define DF_FROM_RSSINLINKLOCAL 0x0000000100LL
#define DF_FROM_META 0x0000000200LL
#define DF_UNIQUETAGHASH 0x0000000400LL
// could it be american or european format?
#define DF_AMBIGUOUS 0x0000000800LL
#define DF_AMERICAN 0x0000001000LL
#define DF_EUROPEAN 0x0000002000LL
// NOTE(review): the next two comment lines are kept from the original; they
// sit above a retired flag and do not obviously describe DF_ONGOING, which
// now owns bit 0x4000.
// . set if date is a bad format or we have an unknown date format
// . format can be "american" or "european" for a document
//#define DF_INHYPERLINK 0x0000004000
#define DF_ONGOING 0x0000004000LL
#define DF_MONTH_NUMERIC 0x0000008000LL
#define DF_REPEATTAGHASH 0x0000010000LL
#define DF_NOTIMEOFDAY 0x0000020000LL
// an explicitly specified time for the event which overrides all (facebook)
#define DF_OFFICIAL 0x0000040000LL
#define DF_STORE_HOURS 0x0000080000LL
// NOTE(review): the original comment here is cut off mid-sentence and does
// not obviously describe DF_INBADTAG. It read:
//   is it like "Tuesday at 7:30pm" but when we telescope up to find more
//   dates, the next bunch of dor "Tuesdays"
#define DF_INBADTAG 0x0000100000LL
#define DF_BEFORE1970 0x0000200000LL
#define DF_CANONICAL 0x0000400000LL
#define DF_MATCHESURLDAY 0x0000800000LL
#define DF_MATCHESURLMONTH 0x0001000000LL
#define DF_MATCHESURLYEAR 0x0002000000LL
#define DF_IN_HYPERLINK 0x0004000000LL
#define DF_NONEVENT_DATE 0x0008000000LL
#define DF_FUZZY 0x0010000000LL
#define DF_LEFT_BOOKEND 0x0020000000LL
#define DF_RIGHT_BOOKEND 0x0040000000LL
// bit 31: this is the value that is an unsigned int when left unsuffixed
#define DF_ASSUMED_YEAR 0x0080000000LL
#define DF_USEDASHEADER 0x0100000000LL
#define DF_INVALID 0x0200000000LL
#define DF_HARD_LEFT 0x0400000000LL
#define DF_HARD_RIGHT 0x0800000000LL
#define DF_COPYRIGHT 0x1000000000LL
#define DF_CLOSE_DATE 0x2000000000LL
// "doc last modified: "
#define DF_PUB_DATE 0x4000000000LL
#define DF_KITCHEN_HOURS 0x8000000000LL
#define DF_IN_LIST 0x0000010000000000LL
#define DF_DUP 0x0000020000000000LL
#define DF_SUB_DATE 0x0000040000000000LL
#define DF_HAS_STRONG_DOW 0x0000080000000000LL
#define DF_HAS_WEAK_DOW 0x0000100000000000LL
#define DF_AFTER_TOD 0x0000200000000000LL
#define DF_BEFORE_TOD 0x0000400000000000LL
#define DF_EXACT_TOD 0x0000800000000000LL
#define DF_EVENT_CANDIDATE 0x0001000000000000LL
#define DF_ONOTHERPAGE 0x0002000000000000LL
#define DF_WEEKLY_SCHEDULE 0x0004000000000000LL
#define DF_REGISTRATION 0x0008000000000000LL
#define DF_SCHEDULECAND 0x0010000000000000LL
#define DF_HAS_ISOLATED_DAYNUM 0x0020000000000000LL
#define DF_IN_CALENDAR 0x0040000000000000LL
#define DF_REDUNDANT 0x0080000000000000LL
#define DF_NOTKILLED 0x0100000000000000LL
#define DF_YEAR_UNKNOWN 0x0200000000000000LL
// for dates that telescope to store hours and have no specific daynum
// or list of daynums... range of daynums is ok
#define DF_SUBSTORE_HOURS 0x0400000000000000LL
#define DF_FIRST_IN_LIST 0x0800000000000000LL
#define DF_TIGHT 0x1000000000000000LL
#define DF_INCRAZYTABLE 0x2000000000000000LL
#define DF_TABLEDATEHEADERROW 0x4000000000000000LL
// top bit: this value does not fit in a signed long long, so the literal
// has an unsigned 64-bit type even with the LL suffix
#define DF_TABLEDATEHEADERCOL 0x8000000000000000LL
//
// values for Date::m_flags5
//
#define DF5_IGNORE 0x0000000000000001LL
// . returns the timestamp in seconds since the epoch
// . returns 0 if no date found in the url itself
// . urlYear/urlMonth/urlDay are optional (default NULL); presumably they
//   receive the individual components found in the url -- confirm in
//   Dates.cpp
// . NOTE(review): the result is int32_t, not time_t
int32_t parseDateFromUrl ( char *url ,
int32_t *urlYear = NULL ,
int32_t *urlMonth = NULL ,
int32_t *urlDay = NULL );
// values for Date::m_type
// (bit flags of type datetype_t; Date::m_hasType is a datetype_t as well)
#define DT_TOD 0x00000001 // (1:30pm utc,one to three am gmt)
#define DT_DAYNUM 0x00000002 // (23rd,25,sixteenth)
#define DT_MONTH 0x00000004 // (nov,11)
#define DT_YEAR 0x00000008 // (2009,09)
#define DT_DOW 0x00000010 // Day Of Week (monday,tues,...)
// bits 0x20 and 0x40 are not assigned
#define DT_HOLIDAY 0x00000080
#define DT_TIMESTAMP 0x00000100
// bit 0x200 is not assigned (DT_MOD is retired)
//#define DT_MOD 0x00000200 // first second last
#define DT_RANGE 0x00000400
#define DT_LIST_OTHER 0x00000800
#define DT_COMPOUND 0x00001000
#define DT_TELESCOPE 0x00002000
// range types
#define DT_RANGE_TOD 0x00004000
#define DT_RANGE_DOW 0x00008000
#define DT_RANGE_MONTHDAY 0x00010000
#define DT_RANGE_DAYNUM 0x00020000
#define DT_LIST_DAYNUM 0x00040000
#define DT_LIST_MONTHDAY 0x00080000
#define DT_LIST_TOD 0x00100000
#define DT_LIST_DOW 0x00200000
#define DT_LIST_MONTH 0x00400000
#define DT_RANGE_TIMEPOINT 0x00800000
#define DT_SUBDAY 0x01000000 // night|nights|evening|mornings|afternoon|...
#define DT_SUBWEEK 0x02000000 // weekend,weekdays,weekends
#define DT_SUBMONTH 0x04000000 // lastdayofmonth,lastweekofmonth,...
#define DT_EVERY_DAY 0x08000000 // 7daysaweek,everyday,...
#define DT_SEASON 0x10000000 // summer,winters,spring,fall,autumn
#define DT_ALL_HOLIDAYS 0x20000000 // "holidays"
#define DT_RANGE_YEAR 0x40000000 // 2010-11
#define DT_RANGE_MONTH 0x80000000
// convenience masks: the union of every DT_RANGE* bit, of every DT_LIST*
// bit, and of the holiday/sub-day/sub-week/sub-month/every-day/season bits
#define DT_RANGE_ANY (DT_RANGE|DT_RANGE_TOD|DT_RANGE_DOW|DT_RANGE_MONTHDAY|DT_RANGE_DAYNUM|DT_RANGE_TIMEPOINT|DT_RANGE_YEAR|DT_RANGE_MONTH)
#define DT_LIST_ANY (DT_LIST_OTHER|DT_LIST_DAYNUM|DT_LIST_MONTHDAY|DT_LIST_TOD|DT_LIST_DOW|DT_LIST_MONTH)
#define DT_SPECIAL_TYPES (DT_HOLIDAY|DT_SUBDAY|DT_SUBWEEK|DT_SUBMONTH|DT_EVERY_DAY|DT_SEASON|DT_ALL_HOLIDAYS)
// . flags type
// . 32 bits
// . NOTE(review): the original comment said "plenty of room for growth",
//   but only three bits (0x20, 0x40 and 0x200) are still unassigned above
typedef uint32_t datetype_t;
// type of Date::m_suppFlags ("modifiers to what we hold"); values are SF_*
typedef uint32_t suppflags_t;
// these are just for DOWs now...
// NOTE(review): that original remark looks stale -- several flags below
// (SF_HAD_AMPM, SF_HAD_MINUTE, ...) are documented as time-of-day modifiers
#define SF_PLURAL 0x000001
#define SF_FIRST 0x000002
#define SF_SECOND 0x000004
#define SF_THIRD 0x000008
#define SF_FOURTH 0x000010
#define SF_FIFTH 0x000020
#define SF_LAST 0x000040
#define SF_NON_FUZZY 0x000080
// did a time of day have an am/pm indicator or not?
#define SF_HAD_AMPM 0x000100
#define SF_NIGHT 0x000200
#define SF_AFTERNOON 0x000400
#define SF_MORNING 0x000800
#define SF_HAD_MINUTE 0x001000 // a TOD with a minute?
#define SF_NON 0x002000
#define SF_MID 0x004000
// retired; its old value 0x4000 is the bit SF_MID uses now
//#define SF_HOLIDAY_WORD 0x4000
#define SF_PM_BY_LIST 0x008000
// retired; bit 0x010000 is currently unassigned
//#define SF_NORMAL_HOLIDAY 0x010000
#define SF_RECURRING_DOW 0x020000
#define SF_EVERY 0x040000
#define SF_MILITARY_TIME 0x080000
#define SF_IMPLIED_AMPM 0x100000
#define SF_ON_PRECEEDS 0x200000
#define SF_SPECIAL_TOD 0x400000
// Day-of-week and year for a time_t.
// NOTE(review): the numbering of the returned day of week and the time zone
// used for the breakdown are not visible in this header -- confirm in
// Dates.cpp.
int32_t getDOW ( time_t t );
int32_t getYear ( time_t t );
// File-scope function; a member of the same name used to live in class Dates
// (its declaration is commented out there). Takes a word range [a,b), the
// word ids and the Bits for the document.
// NOTE(review): presumably reports whether that range is a ticket-sale date
// rather than an event date -- confirm in Dates.cpp.
bool isTicketDate ( int32_t a , int32_t b , int64_t *wids , Bits *bits ,
int32_t niceness ) ;
// One date expression found in a document, or a compound/list/range built
// out of other Date objects through m_ptrs[]. Date pointers are returned by
// Dates::getMem() and Dates::addDate(). The last member, m_ptrs[], is an
// open-ended trailing array, so copying a Date by value does not copy the
// pointers stored in it.
class Date {
public:
// word range relative to m_words Words.cpp class
int32_t m_a;
int32_t m_b;
// used by Events.cpp for event titles algo
int32_t m_maxa;
int32_t m_mina;
// the types of Dates: (see the DT_* #defines above)
// only one bit is allowed to be set unless this is a compound date.
// NOTE(review): the original comment said "there are 8 bit flags" and
// "unless (m_flags & DF_COMPOUND) is true"; datetype_t is 32 bits and no
// DF_COMPOUND is defined in this header, so the test is presumably
// (m_type & DT_COMPOUND) -- confirm.
datetype_t m_type;
// descriptor bits (see the DF_* #defines above)
dateflags_t m_flags;
// we need more than 64 flags now! (see the DF5_* #defines above)
dateflags_t m_flags5;
// types contained by this date
datetype_t m_hasType;
// modifiers to what we hold (see the SF_* #defines above)
suppflags_t m_suppFlags;
// the numeric value of what we represent
int32_t m_num;
// . these two guys are used by Dates::getDateElements()
// . how many date elements we consist of
int32_t m_numFlatPtrs;
// offset into Dates::m_cbuf of the list of those elements
int32_t m_flatPtrsBufOffset;
// the Dates class that contains us
class Dates *m_dates;
// the date # as added. used to set m_tmph now
uint32_t m_arrayNum;
// HACK: for 5pm - 2am, we now truncate to midnight so that
// "Saturday 5pm - 2am" does not have an interval that is really
// considered Friday night
int32_t m_truncated;
int32_t m_penalty;
int32_t m_tagHash;
int32_t m_turkTagHash; // without tag attributes in the hash (xcpt class)
int32_t m_dateTypeAndTagHash32;
int32_t m_occNum;
int32_t m_clockHash;
// if we are in a table, this is the table cell section which
// has m_headColSection and m_colNum, etc. set
class Section *m_tableCell;
//class Section *m_headColSection;
//class Section *m_headRowSection;
// for use by DT_COMPOUND types
char m_month;
int32_t m_year;
char m_dayNum;
// 1 through 7 = Sunday through Saturday
char m_dow;
int32_t m_tod;
time_t m_timestamp;
// for setting dowBits in Dates.cpp
//char m_minDow;
//char m_maxDow;
char m_dowBits;
int32_t m_minYear;
int32_t m_maxYear;
char m_minDayNum;
char m_maxDayNum;
// in seconds
int32_t m_minTod;
int32_t m_maxTod;
// . min pub date of the page that contains us
// . see Dates.cpp or XmlDoc.cpp for an explanation of this
// . this is taken from SpiderRequest::m_parentPrevSpiderTime
//time_t m_minPubDate;
// sometimes an event date does not have a year, so we try to guess
// a range of years it could fall on. we look at the years of other
// dates on the page and use those to make a range of years.
//int32_t m_minStartYear;
//int32_t m_maxStartYear;
// we guess the max year of a date that needs a year and does not have
// one, and we store the guess here
int32_t m_maxYearGuess;
// we scan for the min/max years on page from all event dates
// and then use that range to determine the year when other event dates
// occur, provided they have a dow/month/daynum (but no year) then
// we set this to that year.
int32_t m_dowBasedYear;
// convert years into time_t's. truncate m_maxStartFocus based on
// spideredTime.
int32_t m_minStartFocus;
int32_t m_maxStartFocus;
// supplemental value for "first/second/fifth thursday"
char m_supp;
// do not telescope past this section
//class Section *m_containingSection;
// the smallest section containing word # m_a
class Section *m_section;
class Section *m_compoundSection;
class Section *m_maxTODSection;
class Section *m_calendarSection;
class Date *m_lastDateInCalendar;
// we telescope m_section up until we hit a non-br and breaking
// section... i.e. a "hard" section
class Section *m_hardSection;
class Date *m_subdateOf;
class Date *m_dupOf;
// if we telescope, this guy essentially replaces us
class Date *m_telescope;
// what sentence number are we in? Dates.cpp uses this to disqualify
// dates as headers if they are in the same sentence
//int32_t m_sentenceId;
//int32_t m_sentStart;
//int32_t m_sentEnd;
void *m_used;
int32_t m_headerCount;
uint32_t m_tmph;
uint32_t m_ptrHash;
// . try to normalize so that two dates that represent the exact
// same times will have the same m_dateHash
// . i.e. "11am = 11:00 AM", "3/3/11 = March 3rd 2011"
uint64_t m_dateHash64;
uint64_t m_norepeatKey ;
int32_t m_norepeatResult ;
// usually the date ptr containing the tod, but in the case of
// burtstikilounge.com it is the daynum in that calendar layout.
// this is set late in the game in Events.cpp.
class Date *m_mostUniqueDatePtr;
// used for the above algo for setting m_mostUniqueDatePtr
int32_t m_usedCount;
// kinda like m_mostUniqueDatePtr, but we dedup our telescope
// components, using this as the base. part of normalization
// and used in setDoNotPrintBits();
//class Date *m_coreDate;
// parent->m_ptrs[x] = this!
class Date *m_dateParent;
// used for re-sorting dates as part of printTextNorm() normalization
int32_t m_groupNum;
// . this is used for COMPOUND dates
// . this is also used for lists and ranges of basic dates
// . leave this open-ended! so Dates::getMem() can alloc for the max
// but we may actually end up using less!
// . m_ptrs[] has no declared bound and must stay the last data member
int32_t m_numPtrs;
class Date *m_ptrs[];
// takes a component date, an index "i" and the owning Dates object
// NOTE(review): presumably stores "ptr" in m_ptrs[i] -- confirm in Dates.cpp
void addPtr ( class Date *ptr , int32_t i , class Dates *parent );
// printing helpers; each writes into the SafeBuf "sb" using "words"
void printText ( class SafeBuf *sb , class Words *words ,
bool inHtml = true ) ;
void printText2 ( class SafeBuf *sb , class Words *words ,
bool inHtml = true ) ;
bool printTextNorm ( class SafeBuf *sb , class Words *words ,
bool inHtml = true , class Event *ev = NULL ,
class SafeBuf *intbuf = NULL ) ;
bool printTextNorm2 ( class SafeBuf *sb , class Words *words ,
bool inHtml = true , class Event *ev = NULL ,
class SafeBuf *intbuf = NULL ) ;
void print ( class SafeBuf *sb ,
class Sections *ss ,
class Words *ww ,
int32_t siteHash ,
int32_t num ,
class Date *best ,
class Dates *dates );
bool isSubDate ( class Date *di );
// "dnp" is a caller-supplied hash table of "do not print" dates
bool addDoNotPrintDates ( class HashTableX *dnp );
bool addDoNotPrintRecursive ( datetype_t dt , class HashTableX *dnp ) ;
//int32_t getTextOffset ( int32_t num , int32_t *retEndOff, class Words *words);
// . is part of our compound date in this section?
// . flag which date types are in "si" and return that
// . used by Events.cpp to set EventDesc::m_flags so we
// can show that in the summary on the search results
// page.
//datetype_t getDateTypesInSection ( class Section *si );
//bool printNormalized2 ( class SafeBuf *sb , int32_t nicess ,
// class Words *words );
};
// A pair of time_t endpoints. The interval helpers declared in class Dates
// (getIntervals2(), addInterval(), intersect(), ...) take arrays of these;
// the original note names Dates::hashStartTimes() and Dates::getIntervals()
// as the users.
class Interval {
public:
	// first endpoint (presumably the start of the span -- confirm)
	time_t m_a;
	// second endpoint (presumably the end of the span -- confirm)
	time_t m_b;
};
//#define MAX_DATE_PTRS 8000
// capacity of Dates::m_pools[]
#define MAX_POOLS 100
// Extracts and holds all the Date objects for one document. Work is split
// into setPart1() and setPart2(); the rest of the interface covers interval
// generation, hashing/serialization and debug printing.
class Dates {
public:
Dates ();
~Dates ();
// serialization: the size needed, the size of an already-serialized
// buffer "p", and the write/read routines themselves
int32_t getStoredSize ( );
static int32_t getStoredSize ( char *p );
int32_t serialize ( char *buf );
int32_t deserialize ( char *buf );
void reset();
// . returns false if blocks, returns true otherwise
// . returns true and sets g_errno on error
// . if the content has changed a lot since last time we spidered
// it, then we will add "modified dates" to the list of pub date
// candidates. the DF_ESTIMATED flag will be set for those, and
// the low bit of such pub dates will be cleared. the low bit
// will be set on pub dates that are not estimated.
bool setPart1 ( Url *url ,//char *url ,
Url *redirUrl, // char *redirUrl ,
uint8_t contentType ,
int32_t ip ,
int64_t docId ,
int32_t siteHash ,
class Xml *xml ,
class Words *words ,
class Bits *bits ,
class Sections *sections ,
class LinkInfo *info1 ,
// . old title rec and xml and words
// . parsed up because we had to for adding
// deltas to indexdb
//class Dates *odp ,
HashTableX *cct , // replaces "odp"
class XmlDoc *nd , // new XmlDoc
class XmlDoc *od , // old XmlDoc
char *coll ,
int32_t niceness );
bool addVotes ( class SectionVotingTable *nsvt ) ;
bool hasKitchenHours ( class Section *si ) ;
//bool isTicketDate ( int32_t a , int32_t b , int64_t *wids ) ;
bool isFuneralDate ( int32_t a , int32_t b ) ;
bool isCloseHeader ( class Section *si ) ;
// second stage; takes the Addresses plus a min/max pub date window
bool setPart2 ( class Addresses *aa ,
int32_t minPubDate ,
int32_t maxPubDate ,
// the old one - we read from that
//class SectionVotingTable *osvt ,
bool isXml ,
bool isSiteRoot ) ;
// interval generation for date "dp" over the years year0..year1, with
// an optional list of "close" dates; output goes to the SafeBuf "sb"
bool getIntervals2 ( Date *dp ,
SafeBuf *sb,
int32_t year0 ,
int32_t year1,
Date **closeDates ,
int32_t numCloseDates ,
char timeZone ,
char useDST ,
class Words *words ) ;
int32_t addIntervals ( class Date *di , char hflag , Interval *int3 ,
int32_t depth , class Date *orig );
int32_t addIntervalsB ( class Date *di , char hflag , Interval *int3 ,
int32_t depth , class Date *orig );
bool addInterval ( int32_t a , int32_t b , Interval *int3 , int32_t *ni3 ,
int32_t depth , bool useDayShift = true ) ;
bool addIntervalsForDOW ( int32_t num ,
class Interval *int3 ,
int32_t *ni3 ,
int32_t depth ,
int32_t year ) ;
// set operations on interval arrays: int1 (ni1 entries) and int2 (ni2
// entries) are the inputs, int3 receives the result
// NOTE(review): the int32_t result is presumably the number of intervals
// written to int3 -- confirm in Dates.cpp
int32_t intersect ( Interval *int1 ,
Interval *int2 ,
Interval *int3 ,
int32_t ni1 ,
int32_t ni2 ,
int32_t depth );
int32_t intersect2 ( Interval *int1 ,
Interval *int2 ,
Interval *int3 ,
int32_t ni1 ,
int32_t ni2 ,
int32_t depth );
int32_t intersect3 ( Interval *int1 ,
Interval *int2 ,
Interval *int3 ,
int32_t ni1 ,
int32_t ni2 ,
int32_t depth ,
bool subtractint2 ,
bool unionOp );
//time_t getYearMonthStart ( int32_t y , int32_t m );
// 4th monday of May 2009, for instance, use a dowArg of 2 (monday)
// and a count of 4. returns a time_t
//time_t getDOWStart ( int32_t y , int32_t m , int32_t dowArg , int32_t count);
datetype_t getDateType ( int32_t i , int32_t *val , int32_t *endWord ,
int64_t *wids , int32_t nw ,
bool onPreceeds ) ;
bool addRanges ( class Words *words ,
bool allowOpenEndedRanges = true ) ;
//void addOpenEndedRanges ( ) ;
bool addLists ( class Words *words ,
bool ignoreBreakingTags ) ;
bool makeCompounds ( class Words *words ,
bool monthDayOnly ,
bool linkDatesInSameSentence , // = false ,
//bool dowTodOnly , // = false );
bool ignoreBreakingTags ); // = false
// allocation of Date objects (see m_pools[] below)
class Date *getMem ( int32_t need );
class Date *addDate ( datetype_t dt , // DT_*
dateflags_t tf , // flags
int32_t a ,
int32_t b ,
int32_t num ); // data
// . must call set() above before calling this
// . NOTE(review): no set() is declared in this class any more; the
// setters are setPart1()/setPart2() -- confirm which one is required
// . mdw left off here
// . NOTE(review): m_pubDate is a time_t but the return type is int32_t,
// so the value is narrowed where time_t is wider than 32 bits
int32_t getPubDate ( ) {
return m_pubDate;
//if ( ! m_best ) return -1;
//if ( m_best->m_timestamp <= 0 ) {char*xx=NULL;*xx=0;}
//return m_best->m_timestamp;
};
int32_t getRSSPublishDate ( class Inlink *k ) ;
// returns -1 and sets g_errno on error
int32_t isCompatible ( class Date *di,
class Date *dp ,
class HashTableX *ht ,
class Date *DD ,
bool *hasMultipleHeaders );
// returns -1 and sets g_errno on error
int32_t isCompatible2 ( Section *s1 ,
Section *s2 , bool useXors );
//class Date *getFirstParentOfType( class Date *dd,
// class Date *last ,
// class HashTableX *ht );
// XmlDoc::hash() calls this to index the Dates stored in the
// TitleRec. pages from the same site can use these special termlists
// to see if their tag hashes are likely indicative of a clock or not
bool hash ( int64_t docId ,
class HashTableX *tt ,
class XmlDoc *xd );
bool checkPunct ( int32_t i , class Words *words , char *singleChar );
// returns false and sets g_errno on error
bool parseDates ( class Words *w , dateflags_t defFlags ,
class Bits *bits ,
class Sections *sections ,
int32_t niceness ,
Url *url ,
uint8_t contentType );
bool m_bodySet ;
// Date::m_numFlatPtrs and Date::m_flatPtrsBufOffset are documented as
// being used by this call; the element lists live in m_cbuf
Date **getDateElements ( class Date *di, int32_t *ne );
bool addPtrToArray ( class Date *dp );
SafeBuf m_cbuf;
int32_t getDateNum ( class Date *di ) ;
int32_t printDateNeighborhood ( class Date *di , class Words *w ) ;
bool printDates ( class SafeBuf *sb ) ;
int32_t printDates2 ( ) ;
// gdb can call this one:
int32_t print ( class Date *d );
bool getDateOffsets ( Date *date ,
int32_t num ,
int32_t *dateStartOff ,
int32_t *dateEndOff ,
int32_t *dateSentStartOff ,
int32_t *dateSentEndOff ) ;
// returns false and sets g_errno on error
// NOTE(review): the return type is int32_t, not bool, so the line above
// may be stale -- confirm the error value in Dates.cpp
int32_t parseTimeOfDay3 ( class Words *w ,
int32_t i ,
int32_t niceness ,
int32_t *endWordNum ,
struct TimeZone **tzPtr ,
bool monthPreceeds ,
bool *hadAmPM ,
bool *hadMinute ,
bool *isMilitary ) ;
void setEventBrotherBits();
void setDateParents ( ) ;
void setDateParentsRecursive ( class Date *di , class Date *parent ) ;
void setDateHashes ( ) ;
uint64_t getDateHash ( class Date *di , class Date *orig );
uint64_t getDateHash2 ( class Date *di , class Date *orig );
void setStoreHours ( bool telescopesOnly );
void setMaxYearGuesses ( ) ;
int32_t guessMaxYear ( int32_t i ) ;
int32_t calculateYearBasedOnDOW ( int32_t minYear, int32_t maxYear,
class Date *di );
//bool printNormalized1 ( class SafeBuf *sb ,
// class Event *ev ,
// int32_t niceness ) ;
Date **m_datePtrs;// [ MAX_DATE_PTRS ];
int32_t m_numDatePtrs;
// just like m_datePtrs[] but we do not NULL out any entries
// just because they were used to make a compound, list or range date
Date **m_totalPtrs;// [ MAX_DATE_PTRS ];
int32_t m_numTotalPtrs;
// we now (re)alloc these on demand as well
int32_t m_maxDatePtrs;
bool m_overflowed;
bool m_dateFormatPanic;
bool m_calledParseDates;
int32_t m_shiftDay;
// memory pools for holding Dates and/or Date::m_ptrs lists
char *m_pools[MAX_POOLS];
int32_t m_numPools;
//int32_t m_numDates;
char *m_coll;
//char *m_url;
//char *m_redirUrl;
Url *m_url;
Url *m_redirUrl;
int32_t m_siteHash;
// the old xmldoc, NULL if did not exist
class XmlDoc *m_od;
char *m_current;
char *m_currentEnd;
//int32_t m_now;
//bool m_canHash;
//int32_t m_besti;
// the defacto pubdate
class Date *m_best;
// value handed back by getPubDate()
time_t m_pubDate;
//wbit_t *m_bits;
class Bits *m_bits;
int32_t m_niceness;
dateflags_t m_dateFormat ;
//bool m_gotDateFormatFromDisk ;
//int32_t m_urlDate ;
//int32_t m_urlDateNum ;
int32_t m_urlMonth ;
int32_t m_urlYear ;
int32_t m_urlDay ;
int32_t m_firstGood ;
int32_t m_lastGood ;
// the new xml doc, used for XmlDoc::m_spideredTime
class XmlDoc *m_nd;
class Words *m_words;
char **m_wptrs;
int32_t *m_wlens;
int64_t *m_wids;
nodeid_t *m_tids;
int32_t m_nw;
class Sections *m_sections;
int64_t m_docId;
int32_t m_spiderTime;
class Addresses *m_addresses;
// . how much we have changed from the last time spidered
// . is a percentage and ranges from 0 to 100
// . will be 0 if first time spidered
int32_t m_changed;
// like javascript, gif, jpeg, xml, html, etc.
uint8_t m_contentType;
// timeStruct breakdown of the XmlDoc::m_spideredTime (newDoc/nd)
struct tm *m_spts;
bool m_badHtml;
bool m_needQuickRespider;
int32_t m_year0;
int32_t m_year1;
class HashTableX *getSubfieldTable();
// accessors for the m_tt and m_tnt tables declared below
class HashTableX *getTODTable () { return &m_tt; };
class HashTableX *getTODNumTable () { return &m_tnt; };
void setPhoneXors ();
void setEmailXors ();
void setPriceXors ();
void setTODXors ();
void setDayXors ();
void setAddrXors ();
// "valid" flags
// NOTE(review): presumably each records that the matching table/xor set
// has been computed -- confirm in Dates.cpp
bool m_phoneXorsValid;
bool m_emailXorsValid;
bool m_todXorsValid ;
bool m_dayXorsValid ;
bool m_priceXorsValid;
bool m_ttValid;
bool m_tntValid;
bool m_sftValid;
bool m_dateBitsValid;
bool m_doNotPrintBitsValid;
HashTableX m_tt;
HashTableX m_tnt;
HashTableX m_sft;
// map sectionPtr to array of up to 64 bits. each bit represents
// a field name that is duplicated in the document, and that that
// section contains.
HashTableX m_bitTable;
int32_t m_numLongs;
//class SectionVotingTable *m_osvt;
HashTableX *m_rvt;
bool m_setDateHashes;
bool m_isXml ;
bool m_isSiteRoot ;
};
// Time zone record. getTimeZoneWord() and Dates::parseTimeOfDay3() hand
// back pointers to these.
struct TimeZone {
	// zone name, at most 16 bytes
	char m_name[16];

	// tzinfo:
	// NOTE(review): presumably the hour and minute offsets relative to
	// UTC -- confirm against the zone table in Dates.cpp
	int32_t m_hourMod;
	int32_t m_minMod;
	// NOTE(review): meaning not visible in this header
	int32_t m_modType;
};
// sentinel returned by getTimeZone() for an unknown zone name
#define BADTIMEZONE 999999
// "s" is the timezone, like "EDT" and we return # of secs to add to UTC
// to get the current time in that time zone.
// returns BADTIMEZONE if "s" is unknown timezone
int32_t getTimeZone ( char *s ) ;
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
// . "wids" holds the word ids and "nw" the number of words
// . NOTE(review): *tzptr is presumably set to the matching TimeZone on
//   success -- confirm in Dates.cpp
int32_t getTimeZoneWord ( int32_t i , int64_t *wids , int32_t nw ,
TimeZone **tzptr , int32_t niceness );
// takes a pointer to a single word id
// NOTE(review): presumably true when that word is a date word -- confirm
bool isDateType ( int64_t *pwid ) ;
// returns false and sets g_errno on error
// on success the month for word id "wid" is stored in *retMonth
// NOTE(review): whether *retMonth is 0- or 1-based is not visible here
bool getMonth ( int64_t wid , int32_t *retMonth ) ;
void resetDateTables ( );
uint32_t getDateSectionHash ( class Section *sn );
// defined elsewhere (presumably Dates.cpp); the array size and index base
// are not visible in this header
extern char s_numDaysInMonth[];
#endif