(module mediawiki-struct-parse mzscheme
(require (lib "yacc.ss" "parser-tools")
(lib "lex.ss" "parser-tools")
(prefix : (lib "lex-sre.ss" "parser-tools")))
(require (lib "string.ss" "mzlib"))
(require (lib "string.ss" "srfi" "13"))
(require (lib "list.ss" "mzlib"))
(require (lib "pretty.ss" "mzlib"))
(require (planet "xml.scm" ("oesterholt" "ho-utils.plt" 1 0)))
(provide mediawiki-parse
pp
substr
)
(define (pp . args)
(apply pretty-print args))
(define VERSION "1.0")
(define dbg #f)
(define (return sym val)
(if dbg (display (format "~a~%" sym)))
val
)
(define-syntax lreturn
(syntax-rules ()
((_ (tok x))
(begin
(if (and dbg (not (or (eq? 'tok 'token-CHAR) (eq? 'tok 'token-WS))))
(display (format "~a~%" 'tok)))
(tok x)))
((_ tok)
(begin
(if (and dbg (not (or (eq? 'tok 'token-CHAR) (eq? 'tok 'token-WS))))
(display (format "~a~%" 'tok)))
tok))
))
(define (substr s b . e)
(let ((l (if (null? e)
(string-length s)
(if (< (car e) 0)
(+ (string-length s) (car e))
(car e)))))
(substring s b l)))
(define (trim s)
(string-trim-both s))
(define (%parse-include S def-ns)
(parse-include (string-append "{{" S) "Template"))
(define (%get s p)
(string-trim-both (substr s p)))
(define (get-elems xelem)
(cddr xelem))
(define (xexpr-elems-append l1 l2)
(append l1 (cdr l2)))
(define-lex-abbrevs
(:ws (:or #\newline #\space #\tab))
(:wsr (:: (:* (:or #\space #\tab)) #\newline))
(:wst (:or #\space #\tab))
(:spart (:+ (:or (:/ #\a #\z) (:/ #\A #\Z) (:/ #\0 #\9))))
(:wikispecials (:or #\= #\[ #\] #\{ #\} #\| #\newline))
(:no-newline (:* (char-complement #\newline)))
(:no-newline1 (:+ (char-complement #\newline)))
(:no-pipe (:* (char-complement (:or #\| #\newline))))
(:noendpre (:or (:: (char-complement #\/)(:* (char-complement #\>)))
(:: #\/ (char-complement #\p) (:* (char-complement #\>)))
(:: "/p" (char-complement #\r) (:* (char-complement #\>)))
(:: "/pr" (char-complement #\e) (:* (char-complement #\>)))
(:: "/pre" (char-complement #\>) (:* (char-complement #\>)))
)
)
(:end-pre (:: (:* (:* (char-complement #\<)) (:: #\< :noendpre #\>))
(:* (char-complement #\<))
"</pre>"))
(:no-end-inc (:: (:* (:: (:+ (char-complement #\})) #\} (:+ (char-complement #\})))) (:* (char-complement #\}))))
(:no-italics-end (:: (:* (:: (:+ (char-complement #\')) #\' (:+ (char-complement #\')))) (:* (char-complement #\'))))
(:xt (:* (char-complement #\>)))
(:nonowiki (:or (:: (char-complement #\/) :xt)
(:: #\/ (char-complement #\n) :xt)
(:: "/n" (char-complement #\o) :xt)
(:: "/no" (char-complement #\w) :xt)
(:: "/now" (char-complement #\i) :xt)
(:: "/nowi" (char-complement #\k) :xt)
(:: "/nowik" (char-complement #\i) :xt)
(:: "/nowiki" (char-complement #\>) :xt)
)
)
(:no-nowiki-end (:: (:* (:* (char-complement #\<)) (:: #\< :nonowiki #\>))
(:* (char-complement #\<))
"</nowiki>"))
(:url (:or "mailto:" (:: :spart #\: #\/ #\/)))
(:noclosebrack (:: (:* (:: (:+ (char-complement #\])) #\] (:+ (char-complement #\])))) (:* (char-complement #\]))))
(:spartws (:: (:+ (:or :spart :wst #\. #\, #\; #\- #\_ #\# #\$ #\% #\* #\& #\^ #\( #\) #\! #\@ #\< #\> #\? #\/ #\\ #\' #\` #\~ #\"))))
)
(define-tokens value-tokens (ILINK ELINK IELINK CHAR WS WORD ITEM INCLUDE H1 H2 H3 H4 TABLE-BEGIN TABLE-ROW TABLE-COL
VERBATIM LINE PRE PAGE VAR VAL TEXT ATTRIBUTE LINKTYPE NAMESPACE TOGGLE BTAG ETAG))
(define-empty-tokens e-tokens (EOF TABLE-END EMPTY-LINE NEWLINE VERB-BEGIN VERB-END INC-BEGIN INC-END
E-BOLD B-BOLD E-ITALICS B-ITALICS E-BOLD-ITALICS B-BOLD-ITALICS HLINE))
(define attribute-lex
(lexer
((eof) 'EOF)
((:: (:+ (char-complement (:or :wst #\=))) (:* :wst) "=" (:* :wst)) (lreturn (token-VAR (trim (substr lexeme 0 -1)))))
((:: #\" (:* (char-complement #\")) #\") (lreturn (token-VAL (substr lexeme 1 -1))))
((:+ (char-complement (:or :wst #\= #\| #\"))) (lreturn (token-VAL lexeme)))
(any-char (lreturn (token-CHAR lexeme)))
))
(define attribute-parser
(parser
(start start)
(end EOF)
(tokens value-tokens e-tokens)
(error (lambda (a b c) (display (format "ERROR! ~a ~a ~a~%" a b c))(void)))
(grammar
(start (() (list))
((error start) $2)
((vars) $1)
)
(vars ((VAR VAL vars) (cons (xexpr-attr (string->symbol $1) $2) $3))
((CHAR vars) $2)
((CHAR) (list))
((VAL vars) $2)
((VAL) (list))
((VAR VAL) (list (xexpr-attr (string->symbol $1) $2)))
)
)
))
(define (parse-attributes str def-ns)
(if (string? str)
(parse-attributes (open-input-string str) def-ns)
(begin
(port-count-lines! str)
(let ((R (attribute-parser (lambda () (attribute-lex str)))))
(display R)(newline)
(let ((RR (apply xexpr-attrs R)))
(display RR)(newline)
RR)))
))
(define (parse-elink l def-ns)
(let ((L (regexp-match "([^ \t]+)[ \t]+(.*)" l)))
(if (eq? L #f) (set! L (list l l #f)))
(let ((link (cadr L))
(text (if (eq? (caddr L) #f) "" (caddr L))))
(xexpr-elem 'elink
(xexpr-attrs (xexpr-attr 'link link))
(xexpr-elem 'text (parse-text (trim text) def-ns))))))
(define ilink-lex
(lexer
((eof) 'EOF)
((:: #\| (:* (:~ #\|))) (lreturn (token-TEXT (substr lexeme 1))))
((:: :spartws ":=") (lreturn (token-ATTRIBUTE (trim (substr lexeme 0 -2)))))
((:: :spartws "::") (lreturn (token-LINKTYPE (trim (substr lexeme 0 -2)))))
((:: :spartws ":") (lreturn (token-NAMESPACE (trim (substr lexeme 0 -1)))))
((:: ":") (lreturn (token-NAMESPACE "")))
((:: :spartws) (lreturn (token-PAGE (trim lexeme))))
))
(define (ilink-parse def-ns)
(parser
(start start)
(end EOF)
(tokens value-tokens e-tokens)
(error (lambda (a b c) (display (format "ERROR! ~a ~a ~a~%" a b c))(void)))
(grammar
(start ((error start) $2)
((link) $1)
)
(link ((attributes page) (xexpr-elem 'meta $2
(xexpr-elems $1)))
((attributes page texts) (xexpr-elem 'meta $2 (xexpr-elems $1 $3)))
((linktypes page) (xexpr-elem 'ilink $2 (xexpr-elems $1)))
((linktypes page argtext) (xexpr-elem 'ilink $2
(apply xexpr-elems (cons $1 $3))))
)
(page ((PAGE) (xexpr-elem 'page (xexpr-attrs (xexpr-attr 'link $1)) (xexpr-elems (xexpr-elem 'ns def-ns))))
((namespaces PAGE) (xexpr-elem 'page (xexpr-attrs (xexpr-attr 'link $2)) (apply xexpr-elems $1) ))
)
(namespaces ((namespace) (list $1))
((namespace namespaces) (cons $1 $2))
)
(namespace ((NAMESPACE) (xexpr-elem 'ns (if (string=? $1 "") def-ns $1)))
)
(texts ((TEXT texts1) (xexpr-elem 'text (parse-text (string-append $1 $2) def-ns)))
)
(texts1 ((TEXT texts1) (string-append "|" $1 $2))
(() "")
)
(attributes ((ATTRIBUTE attrs) (xexpr-elem 'attributes (apply xexpr-elems (cons (xexpr-elem 'attribute $1) $2))))
)
(attrs (() (list))
((ATTRIBUTE attrs) (cons (xexpr-elems 'attribute $1) $2))
)
(argtext ((TEXT) (list (xexpr-elem 'text (parse-text $1 def-ns))))
((TEXT argtext) (cons (xexpr-elem 'arg $1) $2))
)
(linktypes ((ltypes) (xexpr-elem 'linktypes (apply xexpr-elems $1)))
)
(ltypes (() (list))
((LINKTYPE ltypes) (cons (xexpr-elem 'type $1) $2))
)
)
))
(define (parse-ilink str def-ns)
(if (string? str)
(parse-ilink (open-input-string str) def-ns)
(begin
(port-count-lines! str)
(let ((R ((ilink-parse def-ns) (lambda () (ilink-lex str)))))
R))
))
(define in-italics #f)
(define in-bold #f)
(define in-bi #f)
(define text-lex
(lexer
((eof) 'EOF)
((:: "<nowiki>" :no-nowiki-end "</nowiki>") (lreturn (token-CHAR lexeme)))
("''" (lreturn (begin (set! in-italics (not in-italics))
(if in-italics (token-B-ITALICS) (token-E-ITALICS)))))
("'''" (lreturn (begin (set! in-bold (not in-bold))
(if in-bold (token-B-BOLD) (token-E-BOLD)))))
("'''''" (lreturn (begin (set! in-bi (not in-bi))
(if in-bi (token-B-BOLD-ITALICS) (token-E-BOLD-ITALICS)))))
((:: #\[ :url (:+ (char-complement #\])) #\]) (lreturn (token-ELINK (trim (substr lexeme 1 -1)))))
((:: :url (:+ (char-complement :wst))) (lreturn (token-ELINK lexeme)))
((:: "[[" :url :noclosebrack "]]") (lreturn (token-IELINK (trim (substr lexeme 2 -2)))))
((:: "[[" :noclosebrack "]]") (lreturn (token-ILINK (trim (substr lexeme 2 -2)))))
((:: "__" :spart "__") (lreturn (token-TOGGLE (trim (substr lexeme 2 -2)))))
((:: #\< (:+ (:~ #\newline #\>)) #\>) (lreturn (if (char=? (string-ref lexeme 1) #\/)
(token-ETAG (trim (substr lexeme 2 -1)))
(token-BTAG (trim (substr lexeme 1 -1))))))
(any-char (lreturn (token-CHAR lexeme)))
))
(define (text-parser def-ns)
(parser
(start start)
(end EOF)
(tokens value-tokens e-tokens)
(error (lambda (a b c) (display (format "ERROR! ~a ~a ~a~%" a b c))(void)))
(grammar
(start ((error start) $2)
((text) (apply xexpr-elems $1))
)
(text ((markup text) (cons $1 $2))
((string text) (cons (xexpr-elem 'nm $1) $2))
((ELINK text) (cons (parse-elink $1 def-ns) $2))
((ILINK text) (cons (parse-ilink $1 def-ns) $2))
((IELINK text) (cons (xexpr-elem 'nm "[")
(cons (parse-elink $1 def-ns)
(cons (xexpr-elem 'nm "]") $2) ) ) )
((TOGGLE text) (cons (xexpr-elem 'toggle $1) $2))
((BTAG text) (cons (xexpr-elem 'xml-begin $1) $2))
((ETAG text) (cons (xexpr-elem 'xml-end $1) $2))
(() (list))
)
(markup ((B-ITALICS text E-ITALICS) (xexpr-elem 'i (apply xexpr-elems $2)))
((B-BOLD text E-BOLD) (xexpr-elem 'b (apply xexpr-elems $2)))
((B-BOLD-ITALICS text E-BOLD-ITALICS) (xexpr-elem 'bi (apply xexpr-elems $2)))
)
(string ((CHAR) $1)
((CHAR string) (string-append $1 $2))
)
)
))
(define (parse-text str def-ns)
(if (string? str)
(begin
(parse-text (open-input-string str) def-ns))
(begin
(port-count-lines! str)
(let ((R ((text-parser def-ns) (lambda () (text-lex str)))))
R))
))
(define include-lex
(lexer
((eof) 'EOF)
((:: "{{" (:+ (char-complement #\newline))) (lreturn (token-PAGE (trim (substr lexeme 2)))))
((:: #\newline "|" (char-complement #\=) "=") (lreturn (token-VAR (substr lexeme 1 -1))))
(any-char (lreturn (token-CHAR lexeme)))
))
(define (include-parser def-ns)
(parser
(start start)
(end EOF)
(tokens value-tokens e-tokens)
(error (lambda (a b c) (display (format "ERROR! ~a ~a ~a~%" a b c))(void)))
(grammar
(start (() (list))
((error start) $2)
((PAGE vars) (return 'include (xexpr-elem 'include (parse-ilink $1 def-ns) (apply xexpr-elems $2 ))) )
)
(vars (() (list))
((VAR text vars) (cons (xexpr-elem 'arg (xexpr-attrs (xexpr-attr 'name (trim (substr $1 1))))
(apply xexpr-elems (get-elems (mediawiki-parse $2)))) $3))
)
(text ((CHAR) "")
((CHAR text) (string-append $1 $2))
)
)
))
(define (parse-include str def-ns)
(if (string? str)
(parse-include (open-input-string str) def-ns)
(begin
(port-count-lines! str)
((include-parser def-ns)(lambda () (include-lex str))))
))
(define (parse-item str def-ns)
(let ((L (regexp-match "([#*]+)(.*)" str)))
(let ((bullets (cadr L))
(text (parse-text (caddr L) def-ns)))
(xexpr-elem 'item (xexpr-attrs (xexpr-attr 'bullets bullets)) text))))
(define mediawiki-struct-lex
(lexer
((eof) 'EOF)
((:: #\newline "====" :no-newline1 "====") (lreturn (token-H4 (trim (substr lexeme 5 -4)))))
((:: #\newline "===" :no-newline1 "===") (lreturn (token-H3 (trim (substr lexeme 4 -3)))))
((:: #\newline "==" :no-newline1 "==") (lreturn (token-H2 (trim (substr lexeme 3 -2)))))
((:: #\newline "=" :no-newline1 "=") (lreturn (token-H1 (trim (substr lexeme 2 -1)))))
((:: #\newline "{|" :no-newline) (lreturn (token-TABLE-BEGIN (trim (substr lexeme 3)))))
((:: #\newline "|}" :no-newline) (lreturn (token-TABLE-END)))
((:: #\newline "|-" :no-newline) (lreturn (token-TABLE-ROW (trim (substr lexeme 3)))))
((:: #\newline "|" :no-newline) (lreturn (token-TABLE-COL (trim (substr lexeme 2))))) ((:: #\newline :wst :no-newline) (lreturn (token-VERBATIM (trim lexeme))))
((:: #\newline (:+ (:or "*" "#")) :no-newline) (lreturn (token-ITEM lexeme)))
((:: #\newline "----") (lreturn (token-HLINE)))
((:: "<pre>" :end-pre) (lreturn (token-PRE (substr lexeme 5 -5))))
((:: "{{" :no-end-inc "}}") (lreturn (token-INCLUDE (substr lexeme 2 -2))))
(any-char (lreturn (token-CHAR lexeme)))
))
(define (mediawiki-struct-parse def-ns)
(parser
(start start)
(end EOF)
(tokens value-tokens e-tokens)
(error (lambda (a b c) (display (format "ERROR! ~a ~a ~a~%" a b c))(void)))
(grammar
(start (() (list))
((error start) $2)
((wikitext) (return 'wikitext (xexpr-elem 'wikitext
(xexpr-attrs (xexpr-attr 'version VERSION))
(apply xexpr-elems $1))))
)
(wikitext ((elem) (list $1))
((elem wikitext) (cons $1 $2))
)
(elem ((H1) (return 'h1 (xexpr-elem 'h1 (parse-text $1 def-ns))))
((H2) (return 'h2 (xexpr-elem 'h2 (parse-text $1 def-ns))))
((H3) (return 'h3 (xexpr-elem 'h3 (parse-text $1 def-ns))))
((H4) (return 'h4 (xexpr-elem 'h4 (parse-text $1 def-ns))))
((HLINE) (return 'hline (xexpr-elem 'hline)))
((TABLE-BEGIN table TABLE-END) (return 'table (xexpr-elem 'table
(parse-attributes $1 def-ns) (apply xexpr-elems $2))))
((INCLUDE) (%parse-include $1 def-ns))
((text) $1)
((verbatim) $1)
((PRE) (return 'verbatim (xexpr-elem 'pre (substr $1 5 -6))))
((items) (return 'items (xexpr-elem 'items (apply xexpr-elems (item-reformer $1 def-ns)))))
)
(items ((ITEM items) (cons (parse-item $1 def-ns) $2))
((ITEM) (list (parse-item $1 def-ns)))
)
(text ((ttext) (return 'text (xexpr-elem 'text (parse-text $1 def-ns)))))
(ttext ((CHAR ttext) (string-append $1 $2))
((CHAR) $1))
(verbatim ((verbs) (return 'verbatim (xexpr-elem 'verbatim $1)))
)
(verbs ((VERBATIM) $1)
((VERBATIM verbs) (string-append $1 $2))
)
(table ((rows) $1)
)
(rows (() (list))
((TABLE-ROW cols rows) (return 'row (cons (xexpr-elem 'trow (parse-attributes $1 def-ns) (apply xexpr-elems $2))
$3)))
)
(cols (() (list))
((TABLE-COL wikitext cols) (return 'col (cons (xexpr-elem 'tcol (parse-attributes $1 def-ns) (apply xexpr-elems $2))
$3)))
((TABLE-COL cols) (return 'col (cons (xexpr-elem 'tcol (parse-attributes $1 def-ns))
$2)))
)
)
))
(define (item-bullets item)
(let* ((attrs (cadr item))
(bul (car attrs))
(val (cadr bul)))
(map (lambda (b)
(if (eq? b #\*)
'ul
'nl))
(string->list val))))
(define (item-value item)
(cddr item))
(define (new-branch bullets item)
(if (null? bullets)
(list 'item item)
(list
(list (car bullets) (new-branch (cdr bullets) item)))
))
(define (item-append struct el)
(append struct el))
(define (item-adder struct bullets item)
(if (null? bullets)
(if (null? struct)
(list 'item item)
(item-append struct (list (list 'item item))))
(if (null? struct)
(new-branch bullets item)
(let ((node (car (reverse struct))))
(if (eq? (car node) (car bullets))
(begin
(set-cdr! node (item-adder (cdr node) (cdr bullets) item))
struct)
(item-append struct (new-branch bullets item)))))))
(define (walk-items struct)
(define (wi el)
(if (eq? (car el) 'ul)
(xexpr-elem 'ul (apply xexpr-elems (walk-items (cdr el))))
(if (eq? (car el) 'nl)
(xexpr-elem 'nl (apply xexpr-elems (walk-items (cdr el))))
(cadr el))))
(map wi struct))
(define (item-reformer items def-ns)
(let ((struct '()))
(for-each (lambda (i)
(set! struct (item-adder struct (item-bullets i) i)))
items)
(walk-items struct)))
(define (mediawiki-parse ip . ns)
(let ((def-ns (if (null? ns) "default" (car ns))))
(if (string? ip)
(mediawiki-parse (open-input-string ip) def-ns)
(begin
(port-count-lines! ip)
((mediawiki-struct-parse def-ns) (lambda () (mediawiki-struct-lex ip))))
)))
)