#
# Rend(er) wiki-text into HTML.
#
# Documentation of what exactly wiki-text consists of is not here; look
# for it in dwiki/Formatting in the DWiki test pages.
#
# WikiText rendering should result in valid and properly nested HTML.
# Badly nested explicit lists are the only exception, because nesting
# them 'properly' produces ugly-looking results in common browsers.
#
import re, string
import htmlrends, derrors
import macros
import rendcache
# Options for how we render things; as flags, they're OR'd together.
# Rendering options.
NOLINKS = (1<<0)
ABSLINKS = (1<<1)
NOFOLLOW = (1<<2)
TITLEONLY = (1<<3)
# internal use only
# these can only be set when we are doing a relatively full rendering,
# one that makes the specific type of additional result valid.
set_features_option = (1<<8)
set_title_option = (1<<9)
rendering_flags = (set_features_option | set_title_option)
# Macro restrictions.
# Normally we allow everything (except CutShort, which must always
# be explicitly enabled).
# In NOMACROS, no macros act and they render as themselves.
# In SOMEMACROS, only allowed macros render, and disallowed macros
# render as '{macro suppressed}'.
NOMACROS = (1<<16)
SOMEMACROS = (1<<17)
ALLOW_RESTRICTED = (1<<18)
ALLOW_CANCOMMENT = (1<<19)
ALLOW_CUTSHORT = (1<<20)
ALLOW_IMG = (1<<21)
macro_perms = {
'Restricted': ALLOW_RESTRICTED,
'CanComment': ALLOW_CANCOMMENT,
'CutShort': ALLOW_CUTSHORT,
'IMG': ALLOW_IMG,
}
# Things that are blocked unless they are explicitly permitted.
macro_block_perms = {'CutShort': ALLOW_CUTSHORT,}
# Flags for various modes.
# (I think that increasingly, set_features_option is the default.)
terse_flags = (SOMEMACROS | ALLOW_RESTRICTED | ALLOW_CANCOMMENT |
ALLOW_CUTSHORT | ALLOW_IMG | set_features_option )
# Flags used when we are rendering to check results.
check_flags = (SOMEMACROS | ALLOW_RESTRICTED | ALLOW_CANCOMMENT |
set_features_option | NOLINKS)
# This collection of regular expressions matches the various classes
# of lines that can occur in wikitext. Generally speaking, the second
# match group is the content (except for cases where there IS no
# content); sometimes we have to introduce bogus groups to make this
# happen.
# All regexps are re.match()'d, not re.search()'d, so they are
# implicitly anchored at the start as well as the explicit end
# anchor.
lineClassRegexps = (
# The following pattern must *not* include any character that starts
# another element.
# todo: change \d somehow to fix accidental numbered lists trap
(re.compile(r'([^\s=*>{|#\d+.-].*)'),'p'), # common case - other p's below
(re.compile(r'\s*$'),'blank'),
(re.compile(r'(\s(\s*))(\S.*)'),'indented'), # either pre or continuation
(re.compile(r'(=+)\s(.+)?'),'header'),
(re.compile(r'\*\s+\*\s+\*\s*$'),'starsep'),
(re.compile(r'(>(?:\s>)*(?:\s|$))()(.*)'),'quote'),
(re.compile(r'\|\s+(.+)'),'table'),
(re.compile(r'\|_.\s+(.+)'),'horitable'),
(re.compile(r'(\*)(\s+)(.+)'),'ul'),
# this should generate a different token, but implementation issues
# make it simpler to fix up in the general list handler.
(re.compile(r'(\+)(\s+)(.+)'),'ul'),
# todo: see above re \d.
(re.compile(r'([\d#])(\s+)(.+)'),'ol'),
# this blows up on '- [[abc http://somesite/]]: foobar ...', so
# we need the more complicated version.
#(re.compile(r'(-)(\s)(\S[^:]*)(:\s+)(.+)'),'dl'),
(re.compile(r'(-)(\s)(\S(?:[^:]|:(?!\s))*)(:\s+)(.+)'),'dl'),
(re.compile(r'-{4,}\s*$'),'hr'),
# The normal innerlist cannot include '-', because otherwise
# it swallows '-- signoff', which is *not* an indented list.
# We must match '--' only when it has a valid 'dl' format,
# which we clone from the 'dl' regexp above.
(re.compile(r'([0#*+])()((?:\1)+\s.*)'),'innerlist'),
(re.compile(r'(-)()(-+\s\S[^:]*:\s+.+)'), 'innerlist'),
# this also affects the basic case at the start, which must
# include . as a stop character.
(re.compile(r'\.pn\s+(.+)'), 'procnote'),
(re.compile(r'{{CutShort(|:[^}]+)}}\s*$'), 'cutshort'),
# THIS MUST BE THE LAST CASE
# cks keeps forgetting this.
(re.compile(r'(\S.*)'),'p'),
)
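# Illustrative examples (these are not from the original source): given
# the ordering above, sample lines classify roughly as
#   'plain words'    -> 'p'      (the common-case first pattern)
#   '== Title'       -> 'header' (two '='s, so a level-2 header)
#   '* an item'      -> 'ul'
#   '- term: def'    -> 'dl'
#   '| a | b |'      -> 'table'
#   '-----'          -> 'hr'     (no space after '-', so it is not a 'dl')
#   '> quoted text'  -> 'quote'  (recursively classifies the inner text)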
def expandedLength(tabsize, astr, prevPos):
offset = prevPos
for c in astr:
if c != '\t':
offset += 1
else:
offset -= offset % tabsize
offset += tabsize
return offset
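# For example, with the usual tabsize of 8, expandedLength(8, '\ta', 0)
# is 9: the tab advances to the next 8-column tab stop and 'a' adds one.
# Starting mid-stop, expandedLength(8, '\t', 3) is 8.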
def getTokFromLine(line, qoffset, linenum):
__pychecker__ = "no-returnvalues"
match = None
lineClass = None
for x in lineClassRegexps:
match = x[0].match(line)
if match:
lineClass = x[1]
break
if lineClass == 'quote' or lineClass == 'indented' or \
lineClass == 'innerlist':
newoffset = expandedLength(8, match.group(1), qoffset)
innerTok = getTokFromLine(match.group(3), newoffset, linenum)
return (lineClass, line, qoffset, match, innerTok)
if lineClass == 'header' and 1 == linenum:
lineClass = 'header1'
return (lineClass, line, qoffset, match)
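# A token is (class, line, offset, match) for simple lines; the nesting
# classes carry a recursively built inner token as a fifth element, so
# (illustratively) '> quoted' yields something like
#   ('quote', '> quoted', 0, <match>, ('p', 'quoted', 2, <match>))
# where 2 is the expanded width of the '> ' prefix.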
s_ents = ('i', 'b', 'tt', 'p', 'pre', 'table', 'ul', 'ol', 'li', 'blockquote',
'em', 'strong', 'code', 'dl', 'dt', 'dd', 'h1', 'h2',
'h3', 'h4', 'h5', 'h6', 'tr', 'td',
'big', 'small', 'strike', 'sub', 'sup', 'u', )
x = None
start_entity = dict([(x, "<%s>" % x) for x in s_ents])
end_entity = dict([(x, "</%s>" % x) for x in s_ents])
# shut pychecker up.
del x
# TODO: should have a better way of styling these. CSS, anyone?
# See http://www.w3.org/TR/REC-html40/struct/tables.html
# Also http://www.w3.org/TR/1998/REC-CSS2-19980512/tables.html
# We style with CSS these days, but supply explicit table styling to
# cope with CSS-less browsers so the result does not look like ass.
start_entity['table'] = '<table class="wikitable" border="1" cellpadding="4">'
start_entity['td'] = '<td valign="top">'
start_entity['ntd'] = '<td valign="top" align="right">'
start_entity['horitable'] = '<table class="wikitable horizontal">'
# This is researched from various places. 'word-wrap: break-word' is
# apparently not necessary any more, so I am leaving it out for now.
start_entity['prewrap'] = '<pre style="white-space: pre-wrap;">'
# Easiest way to handle this:
end_entity['horitable'] = end_entity['table']
end_entity['ntd'] = end_entity['td']
end_entity['prewrap'] = end_entity['pre']
# Fake entity for lists that jump straight to a second level of nesting:
start_entity['innerlist'] = '<div style="margin-left: 2em">'
end_entity['innerlist'] = '</div>'
# Bits of cell and table line matching.
table_cell_boundary_re = re.compile(r'(?<!\S)\|(?!\S)')
# This matches 'numeric' table cell entries, ones that should be set
# flush right instead of flush left. This deliberately excludes
# apparent percentages, for reasons that I am fuzzy about.
numeric_td_match_re = re.compile(r'^(?:~~|\*)?[-+]?(?:[0-9,]+|[0-9]+\.[0-9]+)(?:~~|\*)?$')
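# For example, 'a | b' splits into two cells while 'a|b' stays one cell,
# and entries like '1,234', '+3.14', or '*42*' count as numeric (flush
# right) while '50%' does not.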
# These surround generated wiki HTML.
wikitext_start = '<div class="wikitext">'
wikitext_end = '</div>'
# Used in splitting apart space-separated brlinks
# Because these may contain embedded newlines, we must use some flags.
lastspace_re = re.compile(r'^(.+)\s+(\S+)$', re.MULTILINE|re.DOTALL)
# Special preformatted marking. This has to occur at the very start
# of the content. (Note that it is matched against the *raw* content,
# since it includes a \n.)
# We have since expanded this to include a 'search' directive, which
# expands the search path for places to find pages.
pragma_re = re.compile(r'^#pragma (pre|plaintext|search .*)\n')
def get_pragma(data):
mo = pragma_re.match(data)
if not mo:
return None
else:
return mo.group(1)
def is_plaintext(data):
return get_pragma(data) in ('pre', 'plaintext')
def search_dirs(data):
pr = get_pragma(data)
if pr and pr.startswith("search "):
sl = []
# Cope with people's desires to start paths with '/'
# to mean 'absolute, honest'. Possibly I should deal
# with this upstream.
for e in pr.split()[1:]:
if e and e[0] == '/':
e = e[1:]
sl.append(e)
return sl
else:
return []
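# For example, a page starting with '#pragma search /writing notes\n'
# gives get_pragma() of 'search /writing notes' and search_dirs() of
# ['writing', 'notes'], with the leading '/' stripped.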
# Valid HTML and XML cannot include certain ranges of control
# characters, so we must suppress any that pop up for correctness.
# This is especially important for XML, since a) Atom is an XML
# dialect and b) various Atom/XML parsers just refuse your document
# entirely if it's invalid.
# (See the comments in atomgen.py for more depression on this, as
# one actually needs to exclude certain *Unicode* characters too.)
#
# Control character eating in regular text is done in the prelinestuff
# matching array and inline_control(), because this approach seems the
# fastest and best. However, ((..)), [[...|]], and <pre> blocks go
# directly to inline_plaintext(), so we must also handle it there.
#
# (if we wanted to just eat control characters, or do a straight
# substitution to ? or something, string.translate might be faster.
# However, note that we expect this to almost never hit.)
#
# To do this 100% correctly we would have to worry about links and
# a few other sources of stray characters.
#
# This is not entirely correct here. The XML standard says that
# 127-159 are bad; however, we are interpreting bytes, not Unicode
# characters. UTF-8, the most likely encoding, actually uses *bytes*
# in the 128-159 range for perfectly valid Unicode characters. Fixing this is
# intractable in DWiki's current model.
# This runs range(0, 32) because range() excludes the end point; the
# real range is 0-31 inclusive (minus tab, LF, and CR).
def cc_chars():
return "".join([chr(x) for x in range(0, 32) + [127,]
if x not in (9, 10, 13)])
def cc_pat():
return "["+cc_chars()+"]"
cchar_re = re.compile(cc_pat())
# The processing of "inline" markup is done in a lookup table. Before
# we can make the lookup table, however, we need to bind all the
# names, so here they are. Note that inline_plaintext is aliased as
# htmltext inside the class.
def inline_plaintext(rend, txt):
txt = txt.replace('&', '&amp;')
txt = txt.replace('<', '&lt;')
# XXX: replacing > is unnecessary and bulks up the text.
#txt = txt.replace('>', '&gt;')
# XXX: we leave " and ' strictly alone; what the browser sees
# is what the user wrote, not what we think should be seen.
#txt = txt.replace('"', '&quot;')
#txt = txt.replace("'", '&#39;')
# We must eat bad control characters. The simplest and
# reasonably fast way is to do it with a regexp substitute,
# which is very fast for a) short text and b) misses, which
# is what we expect.
txt = cchar_re.sub("{bad character}", txt)
rend.result.append(txt)
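# E.g. inline_plaintext(rend, '1 < 2 & 3') appends '1 &lt; 2 &amp; 3'
# to the result, with any stray control characters replaced by the
# '{bad character}' marker.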
# NOTE: inline_font is special magic, because it gets and returns the
# text. (Under some circumstances it does lookahead and the like,
# and swallows extra text.)
ere_text = '(.*?)[^\s]X'
sre_text = 'X[^\s]'
font_end_res = {}
font_start_res = {}
for i in ('*', '~~'):
font_end_res[i] = re.compile(ere_text.replace('X', re.escape(i)))
font_start_res[i] = re.compile(sre_text.replace('X', re.escape(i)))
# '_' has relaxed rules; anything qualifies. This is because _ affects
# text spacing, so I have been known to write things like '_ \\ _' and
# want it to come out JUST LIKE THAT. Arguably I should have used ((..))
# for this, but I didn't so that's life and we cope. - cks
font_end_res['_'] = font_start_res['_'] = re.compile("(.*?)_")
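# The net effect of these patterns: '*' and '~~' only open immediately
# before non-whitespace and only close immediately after it, so '*word*'
# becomes <em>word</em> while '2 * 3 * 4' is left entirely alone. '_' is
# exempt from both checks.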
# Reverse inline font styles by reaching back into the rendered content
# and converting the start tag to its original character. This is a
# hack, but a convenient one that fixes various rendering issues for
# stuff that people write.
#
# As a hack, it only works really when closing improperly nested font
# styles, because in this case we can be fairly certain that we are
# reaching back to fix the proper opening. It cannot be used to fix
# up paragraphs without various sorts of malfunctions; for them, the
# errant unclosed font style just runs to the end of the paragraph.
#
# This should not be necessary if we properly matched start and end
# tags in general, but we only do purely textual forward lookup for
# end tags and their positions and that can be fooled by tags inside
# various nesting constructs (eg '((...))' and '[[..]]'). Really we
# need an additional resolution pass over the raw tokenized text.
#
# Tricky bit: this reversal can never misfire in the case of '(('
# (ie, replacing what was a '((' in the original text with a '_')
# because we only start a (( if we see the ending )) and by
# definition nothing inside the (( can escape the )).
_lpairs = { "</em>": ["<em>", "*"], "</strong>": ["<strong>", "~~"],
"</code>": ["<code>", "_"], }
def unwind_inline(rend, offtag):
assert offtag in rend.inlineEndStack
while rend.inlineEndStack:
s = rend.inlineEndStack.pop(0)
if s == offtag:
break
if s not in _lpairs:
rend.result.append(s)
continue
src, rep = _lpairs[s]
for i in range(len(rend.result)-1, -1, -1):
if rend.result[i] == src:
rend.result[i] = rep
break
else:
# should never happen?!
rend.result.append(s)
rend.result.append(offtag)
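# Illustrative example: in '~~go *deep~~ end*' the '~~' close finds
# '</em>' at the top of the stack; rather than emitting an unbalanced
# close tag, unwinding rewrites the already-emitted '<em>' back into a
# literal '*', giving '<strong>go *deep</strong> end*'.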
# we would like to use _spacepunc, but this is not viable in the face
# of unicode. _letters is not really viable either, but it misfires
# less often because we check for inclusion instead of exclusion.
#_spacepunc = string.whitespace + string.punctuation + '\n'
_letters = string.letters + string.digits
def inline_font(rend, style, text, last):
if style == '*':
hstyle = 'em'
elif style == '~~':
hstyle = 'strong'
elif style == '_':
hstyle = 'code' # why not tt? What's the difference?
# Doubled characters would normally produce empty HTML bits, which
# at least HTML tidy complains about. Rather than have yet another
# way of quoting things, we choose to make them produce themselves.
elif style in ('**', '__', '~~~~'):
rend.result.append(style)
return
# If this style is blocked by a nesting element, we must ignore
# this style. Consider '*test [[case* http://google.com/]]'.
if rend.blocked_style(hstyle):
rend.result.append(style)
return
# Is this style currently disabled?
if style in rend.disabledStyles:
rend.result.append(style)
return
# Closing things is the easy case, so we handle it and then
# bail.
offtag = end_entity[hstyle]
if offtag in rend.inlineEndStack:
#s = rend.inlineEndStack.pop(0)
#while s != offtag:
# rend.result.append(s)
# s = rend.inlineEndStack.pop(0)
#rend.result.append(s)
# rather than closing off unterminated inline styles,
# we unwind them, turning the start tag into its original
# string. this is imperfect.
unwind_inline(rend, offtag)
return
# We insist that start tags be followed by non-whitespace.
# Otherwise, they're just themselves.
if not text or (style != '_' and text[0] in string.whitespace):
rend.result.append(style)
elif style == '_' and last and last[-1] in _letters:
# this must be specific to _ because embedded emphasis
# is perfectly okay; we just want to avoid 'a_name_here'
# turning into 'a((name))here'.
# TODO: check for text[0] in _letters as well?
rend.result.append(style)
else:
# The complicated case; basically, we insist on minimal
# complete spans. This means that:
# a) there must be a valid ending tag for this font start
# (valid end tags have non-whitespace before them)
# b) there must not be any other valid font starts between
# us and the first end tag found.
mo = font_end_res[style].search(text)
if not mo or font_start_res[style].search(mo.group(1)):
rend.result.append(style)
else:
rend.inlineEndStack.insert(0, offtag)
rend.result.append(start_entity[hstyle])
# if we fail to check for _ nesting like this, then our
# output looks wrong after going through html tidy
def inline_code(rend, txt):
alreadycode = (end_entity['code'] in rend.inlineEndStack)
if not alreadycode:
rend.result.append(start_entity['code'])
rend.handle_plaintext(txt)
if not alreadycode:
rend.result.append(end_entity['code'])
def inline_macro(rend, txt):
if not rend.macro(txt):
rend.result.append('{{')
rend.handle_plaintext(txt)
rend.result.append('}}')
def inline_http(rend, txt):
linkend = rend.makelinkstart(txt)
rend.handle_plaintext(txt)
if linkend:
rend.result.append('</a>')
def inline_brlink(rend, txt):
if not rend.brlink(txt):
rend.result.append('[[')
rend.handle_text("text", txt)
rend.result.append(']]')
def inline_br(rend, txt):
__pychecker__ = "no-argsused"
rend.result.append("<br>\n")
# WikiLinks only activate if they are actually live pages.
# WikiLinks are either absolute (preferred) or in the same
# directory. The latter turns out to be what I want about
# 99% of the time.
def inline_wikiword(rend, link):
# Shortcut us if we aren't resolving links.
if rend.options & NOLINKS:
rend.result.append(link)
return
# CHECKME
# apparently all the link-finding stuff below can't find
# words with slashes on the end, so we do the link search
# without the slash
linkc = link
if linkc[-1] == '/':
linkc = linkc[:-1]
# We try first as an absolute path, then as a relative
# path, then finally as something in our alias area.
url = False
cp = None
# Check the cache to see if we have a result already.
if linkc in rend.wikiwordCache:
url = rend.wikiwordCache[linkc]
else:
for cp in (rend.mod.get_page(linkc),
rend.mod.get_page_relname(rend.ctx.page, linkc),
rend.mod.get_alias_page(linkc),
rend.mod.get_page_paths(rend.searchPath, linkc)):
if cp and cp.exists():
break
if cp:
url = page_url(rend.ctx, cp)
rend.wikiwordCache[linkc] = url
if url:
linkend = rend.makelinkstart(url)
rend.result.append(link)
if linkend:
rend.result.append('</a>')
else:
rend.result.append(link)
def inline_control(rend, txt):
if len(txt) > 1:
pl = "characters"
else:
pl = "character"
rend.result.append("{bad %s}" % pl)
def double_paren_pat(opn, cls):
(bo, bc) = ("\\" + opn, "\\" + cls)
(do, dc) = (2 * bo, 2 * bc)
notc = '[^%s]' % bc
return ("%sX(?:%s|%s%s)+Y%s" % (do, notc, bc, notc, dc))
# Bad things happen if the first character subsets of two
# inline patterns overlap, so avoid doing that.
prelinestuff = [
['_*~', r'X(?:__|\*\*|~~~~|_|\*|~~)Y', inline_font],
['(', double_paren_pat('(',')'), inline_code],
# ``...'' to quote text.
["`", double_paren_pat("`", "'"), inline_plaintext],
# XXX: this currently matches just '\\<newline>', not the
# documented ' \\<newline>'. For now I will pass.
# (To solve this we could use a '(?<= )' lookbehind assertion,
# but then we would have to switch the entire big loop to using
# and keeping offsets into the text.)
['\\', 'XY' + ('\\' * 4) + '(?:$|\n)', inline_br],
#['<>&"\'', 'X[\'"<>&]+Y', inline_plaintext],
['<&', 'X[<&]+Y', inline_plaintext],
# We need to handle (invalid) control characters somehow.
[cc_chars(), 'X%s+Y' % cc_pat(), inline_control],
['!', r'!X(?:\[\[|\{\{|\(\(|https?://|``)Y', inline_plaintext],
# bang marks the end of "limited" text subset
['{', double_paren_pat('{','}'), inline_macro],
['[', double_paren_pat('[',']'), inline_brlink],
['h', r'\bXhttps?://[A-Za-z0-9](?:(?![.,;\)"\']*(?:\s|$)).)*Y', inline_http],
# This regexp is *so* last Monday:
# ['A-Z', r'\bX[A-Z][a-z0-9./]*[A-Z][A-Za-z0-9./]*[A-Za-z0-9/]Y', inline_wikiword],
['A-Z', r'(?<!/)\bX[A-Z][a-z0-9./]*[A-Z][A-Za-z0-9./]*[A-Za-z0-9/]Y', inline_wikiword],
['/', r'(?<![A-Za-z0-9])X/[A-Z][a-z0-9./]*[A-Z][A-Za-z0-9./]*[A-Za-z0-9/]Y', inline_wikiword],
]
textrehash = {}
textcodehash = {}
ltextrehash = None
plainre = []
bangplainre = None
# Done to contain temporary variables, because heaven forbid we have
# actual nested scopes aside from function definitions.
def maketexttables():
# probably don't need to declare textrehash and textcodehash
global textrehash, textcodehash, ltextrehash, plainre, bangplainre
firstcharcollection = []
firstcharbitre = re.compile('(.)(?:-(.))?')
for linespec in prelinestuff:
(firstchar, lre, cdref) = linespec
isbang = (firstchar == '!')
plainrebit = lre.replace('X','').replace('Y','')
lre = lre.replace('X','(').replace('Y',')')
lre = re.compile(lre)
mo = firstcharbitre.match(firstchar)
while (mo):
keys = (ord(mo.group(1)),)
if mo.group(2):
keys = range(ord(mo.group(1)), ord(mo.group(2))+1)
for key in keys:
if not chr(key).isalnum():
firstcharcollection.append('\\')
firstcharcollection.append(chr(key))
textrehash[key] = lre
textcodehash[key] = cdref
firstchar = firstchar[mo.end(0):]
mo = firstcharbitre.match(firstchar)
plainre.append(plainrebit)
if isbang:
ltextrehash = dict([(k,v) for (k,v) in textrehash.iteritems()])
bangplainre = '((?:[^%s]+|(?!%s).)+)' % \
(''.join(firstcharcollection), '|'.join(plainre))
plainre = '((?:[^%s]+|(?!%s).)+)' % \
(''.join(firstcharcollection), '|'.join(plainre))
plainre = re.compile(plainre)
bangplainre = re.compile(bangplainre)
maketexttables()
del maketexttables # and all our temporaries go away
del prelinestuff
del double_paren_pat
# The pattern used to split potential stop words to determine how to
# make them go.
# This is not deducible from bangplainre, because we want only style and
# other special characters.
stopw_char_re = re.compile(r"(.*?)([_~*({\[!<&])(.*)")
class StopWord:
def __init__(self, pref, o, suf):
self.pref = pref
self.o = o
self.suf = suf
self.rest = chr(o) + suf
# Canonicalize link text for our link abbreviations.
def canon_ltext(ltext):
return " ".join(ltext.split())
def list_para_start(tok):
if tok[3].group(1) == '+':
return "para"
else:
return "ptext"
#
# Generate a dictionary of all versions of the title given the actual
# HTML of the title plus the start and HTML elements around it (always <hN>
# and </hN>). This title information dictionary will be saved as a single
# cacheable object that all title rendering functions draw from.
#
# We can reliably strip HTML and just links because we know that our
# input is well formed, in fact formed in a specific way.
stripa_re = re.compile("<(a|/a)[^>]*>")
striphtml_re = re.compile("<[^>]+>")
def gen_title_dict(start, title, end):
res = {}
res['title'] = title
res['html'] = "%s%s%s" % (start, title, end)
res['nohtml'] = striphtml_re.sub("", title)
res['nolinks'] = stripa_re.sub("", title)
return res
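# E.g. for a title of 'An <em>odd</em> <a href="/x">link</a>',
#   res['nohtml']  is 'An odd link'
#   res['nolinks'] is 'An <em>odd</em> link'
# and res['html'] is the full title wrapped in the start and end tags.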
def set_ctx_titleinfo(ctx, titleinfo):
if not titleinfo:
return
ctx.setvar(":wikitext:title", titleinfo['title'], True)
ctx.setvar(":wikitext:title:nohtml", titleinfo['nohtml'], True)
ctx.setvar(":wikitext:titleinfo", titleinfo, True)
# This class collects all of the results from rendering a wikitext page
# and is what is returned from WikiRend.render(). After you've gotten
# it, it is normally your responsibility to call .add_to(ctx) to add
# the rendering results to the context.
# HTML is extracted by calling .html(ctx), possibly with additional
# options. Note that you can't render a result without a context
# because the context is what allows us to apply CutShort restrictions
# (if any).
#
# The core rendering result is a list of 'blocks', which are basically
# chunks of HTML. Blocks have a type, some additional information
# attached to the type, and the generated HTML. Chunking the HTML
# and attaching types allows us to do things like 'skip the title'
# or 'stop after the first <p> block'. The type is usually the type
# of the HTML block element that the block contains, but this can
# break down at some point (the short version is that wikitext HTML
# generation is not completely structured); at that point you start
# getting only generic chunks that contain who knows what (and which
# likely have multiple HTML block elements in them).
#
# Blocks are also used to handle the CutShort macro. Regardless of
# CutShort, WikiRend always processes the entire page and chunks up
# the result. CutShort macros introduce special 'cutshort' chunks;
# during HTML generation we spot these and potentially stop
# processing. Handling CutShort in postprocessing means that the
# results of wikitext rendering can be cached and used generally,
# instead of needing separate caches for normal versus CutShort
# (actually each separate CutShort context possible).
#
# Mechanically a block is a two-element tuple, (WHAT, RESULTS).
# RESULTS is a list of strings of the actual HTML output that will
# normally be used (although once it's passed to RendResults in
# .add_block() it is immediately crushed down to a single
# string). WHAT is at least a one-element tuple; the first element is
# a string that describes its type and any subsequent elements are
# additional data. So far only cutshort has additional data; the
# cutshort chunk is:
# ('cutshort', (viewlist,), 'read more HTML')
#
# (viewlist is the list of views to cut in, or 'all'. The 'read more
# HTML' is the HTML that will be added if the cut is active.)
#
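# As a purely illustrative sketch (not captured output), a rendered
# page's blocks might look like:
#   [(('h2',), '<h2>Title</h2>'),
#    (('blank',), '\n'),
#    (('p',), '<p>Intro text.</p>'),
#    (('cutshort', ('blog',), '<p>(read more)</p>'), ''),
#    (('end',), '...the rest of the page...')]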
class RendResults(object):
def __init__(self):
self.blocks = []
self.empty = False
self.titleInfo = None
self.options = 0
self.features = None
self.spath = None
self.cacheable = True
# Add all elements of our results to the context et al.
# NOTE that this must be called with ctx.page as the page we were
# rendered for. Other use is invalid.
def add_to(self, ctx):
if self.options & set_features_option:
for f in self.features:
ctx.addfeature(f)
set_cache_features(self.features, ctx.page, ctx)
if not self.cacheable:
ctx.addfeature('indirect-macros')
if self.options & set_title_option and self.titleInfo:
set_ctx_titleinfo(ctx, self.titleInfo)
if self.spath:
ctx.setvar(":wikitext:search", self.spath, True)
if (self.options & rendering_flags) == rendering_flags:
ctx.setvar(":wikitext:render", self, True)
# As an engineering decision we immediately compact the list of
# HTML to a single string. One reason for this is that it makes
# the pickled version held in the disk cache simpler and smaller.
def add_block(self, what, data):
self.blocks.append((what, ''.join(data)))
# returns whether or not there is even a block of a given type
# in the blocks.
def hasa(self, what):
l = filter(lambda x: x[0][0] == what, self.blocks)
return bool(len(l))
# Skip initial blocks of type skip, plus 'blank' blocks.
# If stopafter is given, stop after the first block of that type.
# cutshort is true if cutshort blocks can do anything.
def _filter(self, view, skip=None, stopafter=None, cutshort=False):
r = []
for what, data in self.blocks:
if skip and what[0] in (skip, 'blank'):
continue
skip = None
# The data payload of a cutshort block is empty,
# because it's what's rendered when we *aren't*
# cutting short.
# what[1] is the views we cut short in or ('all',),
# what[2] is the teaser text.
if cutshort and what[0] == 'cutshort' and \
view != "normal" and \
(view in what[1] or 'all' in what[1]):
r.append(what[2])
break
if data:
r.append(data)
if stopafter and what[0] == stopafter:
break
return r
# Actually generate HTML, or generate absolutely nothing if
# we are marked as explicitly empty. Explicit empty RendResults
# are the result of rendering access-restricted pages that don't
# allow you access.
# The generated HTML has the wikitext <div> around it.
def html(self, ctx, skip = None, stopafter = None,
cutshort = False):
if self.empty:
return ''
return ''.join([wikitext_start] + \
self._filter(ctx.view, skip, stopafter,
cutshort) + \
[wikitext_end])
# Debugging interfaces:
def _render(self):
return ''.join(self._filter('normal'))
def _dump(self):
for btype, data in self.blocks:
print "==", btype, "=="
print ''.join(data)
# ----
# Resolving destinations to URLs
# used inside makelinkstart to detect urls that need absolutin'
absurlre = re.compile('[a-zA-Z0-9]+:')
def is_absurl(url):
return bool(absurlre.match(url)) and \
not url.lower().startswith("javascript:")
def page_url(context, page):
url = context.nurl(page)
# Is the target a redirection, and if so does the
# target of the redirection exist?
res = page.redirect_target()
if res:
if res[0] != 'page':
url = res[1]
elif res[1] and res[1].exists():
url = context.nurl(res[1])
# We decline to walk multiple steps of redirections,
# because then we'd have to figure out if we were
# looping.
return url
def wikilink_to_url(context, link, searchPath):
if is_absurl(link):
url = link
deflname = link
elif link[0] == '<' and link[-1] == '>' and link[1] == '/':
url = link[1:-1]
deflname = url
else:
npage = context.model.get_page_relname(context.page, link)
if searchPath and \
(not npage or not npage.exists()):
np = context.model.get_page_paths(searchPath, link)
if np:
npage = np
if npage:
url = page_url(context, npage)
deflname = npage.name
else:
deflname = link
url = link
return (url, deflname)
# Quote the link URL properly.
def quote_link_url(tgt):
tgt = tgt.replace('&', '&amp;').replace('"', '%22')
tgt = tgt.replace('>', '%3E').replace(' ', '%20')
return tgt
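# E.g. quote_link_url('/a page?x=1&y=2') gives '/a%20page?x=1&amp;y=2',
# just enough quoting to keep generated href="..." attributes well formed.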
# We use a class to contain rendering because it simplifies the job of
# keeping all of the related data accessible.
# We render some particular data in the context of a page. Often the
# data is the contents of the page, but not always (eg comments render
# through this).
class WikiRend:
def __init__(self, data, context, options = None):
self.data = data
self.ctx = context
self.mod = context.model
self.web = context.web
# Used to push HTML fragments into a RendResults() object.
self.rres = RendResults()
self.result = []
self.spos = None
self.pushing = True
self.blockEndStack = []
self.inlineEndStack = []
self.tokqueue = []
self.features = []
self.linkNames = {}
self.linkUrls = {}
self.wikiwordCache = {}
self.abbrCache = {}
self.imgCache = {}
self.blockedStyles = []
self.titleInfo = None
self.useLists = True # macros refer to this
self.usePageTitles = False
self.hasComplex = False
self.searchPath = []
self.disabledStyles = {}
self.textSubs = {}
self.preWraps = False
if options is None:
self.options = 0
else:
self.options = options
# stopwords are complicated.
# stopWords indexes words -> sw objects.
# sw_index goes from special character to the list of
# sw for that character.
self.stopWords = {}
self.sw_index = {}
self.stopPerm = {}
self.sw_cache = {}
# Load global stopwords, which cannot be removed.
if 'literal-words' in context.cfg:
for w in context.cfg['literal-words']:
self.add_stopword(w, True)
def render(self, options = None):
if options:
self.options = options
try:
self.run()
self.force_block('end')
except macros.ReturnNothing:
self.result = []
self.rres.blocks = []
self.rres.empty = True
self.ctx.unrel_time()
self.titleInfo = None
# This one is complicated.
# Consider: a readable file, in a directory
# with an __access that has {{Restricted:user}}
# and {{CanComment:user}}, and a higher level
# __access file that would allow commenting.
# The intention is to disable access to everything
# in the directory, except this file, and to allow
# commenting on nothing.
# If we zapped features narrowly, we would kill
# the CanComment. (Which must be before the
# Restricted, whee.)
self.features.append('restricted')
self.rres.options = self.options
self.rres.features = self.features
self.rres.cacheable = not self.hasComplex
self.rres.titleInfo = self.titleInfo
self.rres.spath = self.searchPath
#self.rres._dump()
return self.rres
#
# Move chunks of the accumulated HTML text into the RendResults
# object.
# We try to push top level block elements in as they are
# generated. However not all block elements push themselves
# so we take care not to mis-label what we push into the
# result.
# CutShort also explicitly pushes blocks in.
# Push a block if it is safe. self.lpos is the last point we
# pushed up to (well, where we expect the next push to start
# from if there are no surprise elements); self.spos is where
# this block element starts from.
def push_block(self, btype):
if not self.pushing:
return
if self.spos != 0:
self.pushing = False
return
self.rres.add_block((btype,), self.result)
self.result = []
# Push a blank line in.
def push_blank(self):
if not self.pushing:
return
if len(self.result) != 1:
self.pushing = False
return
self.force_block('blank')
# Flush a block by force. btype should never be a normal block
# element type.
# As an engineering decision we do not resume regular block
# element pushing after a force even though we could. Pushing
# block elements separately is only useful when it reliably
# tracks the actual HTML structure. Once pushing is false, that
# structure is off. Resynchronizing still doesn't fix that gap in
# the middle.
def force_block(self, btype):
if not len(self.result):
return
self.rres.add_block((btype,), self.result)
self.result = []
# ----
def pull(self):
if self.tokqueue:
return self.tokqueue.pop(0)
if not self.data:
return None
tl = self.data.split('\n', 1)
if len(tl) > 1:
self.data = tl[1]
else:
self.data = None
self.currentLineNumber += 1
return getTokFromLine(tl[0], 0, self.currentLineNumber)
def pushBack(self, tok, *othertoks):
if othertoks:
self.tokqueue[0:0] = othertoks
if tok:
self.tokqueue.insert(0, tok)
def run(self):
# BUG: you should not be able to use pragmas except for
# real page rendering. Results in comments will be what
# they call 'interesting'. This probably needs a specific
# flag for 'not rendering page contents', or maybe 'respect
# pragmas'.
if is_plaintext(self.data):
tl = self.data.split('\n', 1)
if len(tl) > 1 and tl[1]:
self.result.append('<pre>')
self.handle_plaintext(tl[1])
self.result.append('</pre>')
return
sr = search_dirs(self.data)
# FIXME: handle pragmas better. Really pragmas should
# return a pragma result + rest of data blob.
if sr:
self.searchPath = sr
_, self.data = self.data.split("\n", 1)
self.currentLineNumber = 0
filters = WikiRend.filter_routines
x = self.pull()
while x:
filters[x[0]](self, x)
x = self.pull()
if self.options & TITLEONLY and \
self.currentLineNumber >= 2:
# Time to get out.
break
self.result.extend(self.inlineEndStack)
self.result.extend(self.blockEndStack)
self.inlineEndStack = []
self.blockEndStack = []
filter_routines = {}
def blank_handler(self, tok):
__pychecker__ = "no-argsused"
# Generating newlines in the output for blank lines in
# the source makes the generated HTML look much nicer.
self.result.append("\n")
self.push_blank()
filter_routines['blank'] = blank_handler
def starsep_handler(self, tok):
__pychecker__ = "no-argsused"
self.result.append('<p align="center">* * *</p>\n')
filter_routines['starsep'] = starsep_handler
def hr_handler(self, tok):
__pychecker__ = "no-argsused"
self.result.append('<hr>\n')
filter_routines['hr'] = hr_handler
def header_handler(self, tok, special=None):
hlevel = min(len(tok[3].group(1)), 6)
htext = tok[3].group(2)
hdtag = 'h%d' % hlevel
self.handle_begin('begin', hdtag)
self.handle_text('text', htext)
return self.handle_end('end', hdtag, special=special)
filter_routines['header'] = header_handler
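# E.g. a '=== Subsection' line should come out as <h3>Subsection</h3>;
# more than six '='s still clamps to <h6>.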