simple-parser.ss
#lang scheme

(require "common.ss")

(provide new-parser
         add-item
         add-items
         re txt kw
         switch-phase
         sub-parse sub-parse-return cons-out
         parse-text)

;[:title Simple Text Parser]

;: This module provides a simple text parser that can read strings
;: and turn them into data without first building lexems (although it
;: can be used to either lex or parse).
;:
;: More complex or faster parsers may require the use of the intergrated
;: @schememodname[parser-tools].
;:
;: A parser is given a list of matcher procedures and associated action procedures.
;: A matcher is generally a regexp, the associated action turns
;: the matched text into something else.
;: On the input string, the parser recursively looks
;: for the matcher that matches the earliest
;: character and applies its action.
;: $no-match-proc is applied to the portion of the string (before the first matched
;: character) that has not been matched.
;:
;: The parser has an internal state, the "phase", where it is possible
;: to define local parsers that only work when the parser is in that phase.
;: Actions can make the parser switch to a given phase.
;: Automata transitions can then easily be defined.
;:
;: Instead of switching to another phase, it is also possible to set the
;: parser into a "sub-parser" mode, and to provide the sub-parser with a callback
;: that will be applied only once the sub-parser has returned.
;:
;:
;: The fastest and easiest way to understand how it works is probably to
;: look at the examples in the @filepath{examples} directory.
;: Somes simple examples are also given at the end of this page.
;: See also the @filepath{defs-parser.ss} source file for a more complex usage.


;[:section Priorities]
;: When parsing a string, among all matchers of the current phase,
;: the matcher which action is triggered is the one that matches the earliest
;: character in the string.
;: If several matchers apply, then only the @bold{last} added matcher is chosen.
;: In $add-items, the priority is for the matcher that is defined the lowest in the
;: source file.

; EXAMPLES

;[:convention parser parser?]
;: @defproc[(parser? [p any/c]) boolean?]{
;: Returns $#t if $p is a parser, #f otherwise.
;: }

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;   Keyword Searcher in Texts   ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; useful for syntax colorization in slideshow
; each keyword is associated with a function
; so it is more general

;[:section Main Functions]

(define-struct item ; trouver un autre nom car trop ambigu et deja utilisé ?
  ; name : any
  ; search-proc : string -> (( pos-begin . pos-end))
  (search-proc do-proc)
  #:transparent)



; phase is a function (void) -> bool


(define-struct parser 
  ; output is a parameter containing the current output
  (items no-match-proc phase output appender sub-parsers)
  #:mutable
  #:transparent)

(define (new-parser [no-match-proc identity] #:phase [phase 'start]
                    #:appender [appender string-append]) ;:-> parser?
  ;: [no-match-proc procedure?]
  ;: [appender procedure?]
  ;: Creates a new parser with default behavior $no-match-proc, starting in phase $phase.
  ;: All the outputs generated byt the parser are then appended with $appender.
  (make-parser '()
               no-match-proc
               (make-parameter phase)
               (make-parameter '())
               appender '()))

; Sub-parsers stack:
(define (push-sub-parser parser sub)
  (set-parser-sub-parsers! parser (cons sub (parser-sub-parsers parser))))
(define (pop-sub-parser parser)
  (let ([subs (parser-sub-parsers parser)])
    (set-parser-sub-parsers! parser (rest subs))
    (first subs)))
    

; Each keyword is associated to a procedure that will be applied
; to the keyword string when it will be found in the text.

; General function to add an item;
; match-proc : string -> (list (or #f (start . end)))
; (just like regexp-match-position)
(define (add-item-general parser search-proc output-proc)
  (set-parser-items! parser
                     (cons  (make-item search-proc output-proc)
                            (parser-items parser))))

; useful to easily define phase comparisons
(define (eq-phase? phase)
  (λ(p)(equal? phase p)))

(define (add-item-phase parser search-proc output-proc phase?)
  (add-item-general parser 
                    (λ(text)(if (phase? ((parser-phase parser)))
                                (search-proc text)
                                #f))
                    output-proc))

(define current-parser (make-parameter #f))

(define (to-out out) ; turns a user-given out into a correct out
  (cond [(procedure? out) out]
        [else (to-proc out)]))

(define (to-phase? phase?) ; turns a user-given phase? into a phase? test-proc
  (cond [(procedure? phase?) phase?]
        [(equal? #t phase?) (λ args #t)]
        [else (eq-phase? phase?)]))
  
(define (add-no-match-cond parser phase? out)
  (let ([no-match (parser-no-match-proc parser)])
    (set-parser-no-match-proc! 
     parser (λ args (if (phase? ((parser-phase parser)))
                        (apply out args)
                        (apply no-match args))))))

(define (add-item parser phase? in out) ;:-> void?
  ;: [in (or/c #t procedure? list? symbol? string?)]
  ;: [out (or/c procedure? symbol? string?)]
  ;: Adds the matcher $in and its associated action $out
  ;: to $parser.
  ;: The matcher will match only when the parser is in a phase that
  ;: returns $#t when applied to $phase?.
  ;:
  ;: If $phase? is a procedure, it will be used as is to match the parser's phase.
  ;: If $phase? equals $#t it will be changed to @scheme[(λ args #t)]
  ;: such that it matches any phase.
  ;: Any other value of $phase will be turned into a procedure that matches
  ;: this value with $equal?.
  ;:
  ;: If $in is a string it will be turned into a procedure that matches
  ;: the corresponding pregexp.
  ;: If $in is a symbol, it will be turned into a procedure that matches
  ;: the corresponding pregexp with word boundaries on both sides, (useful
  ;: for matching names or programming languages keywords).
  ;: If $in is a list, then $add-item is called recursively on each member
  ;: of $in with the same $parser, $phase? and $out.
  ;: If $in equals $#t, it will modify the $no-match-proc procedure
  ;: to add the corresponding action when $phase? applies to the parser.
  ;: In the end, $in has returns the same kind of values as $regexp-match-positions.
  ;:
  ;: $out must be a procedure that accepts the same number of arguments as
  ;: the number of values returned by the matcher $in.
  ;: For example, if $in is @scheme["aa(b+)c(d+)e"], then $out must
  ;: take 3 arguments (one for the whole string, and two for the b's and the d's).
  ;: If $out is not a procedure, it will be turned into a procedure that accepts
  ;: any number of arguments and returns $out.
  (cond [(list? in) (for-each (λ(in)(add-item parser phase? in out)) in)]
        [(equal? #t in) (add-no-match-cond parser (to-phase? phase?) (to-out out))]
        [else
         (add-item-phase parser
                         (cond [(procedure? in) in]
                               [(symbol? in) (kw in)]
                               [(string? in) (re in)]
                               [else (error "unknown type for " in)])
                         (to-out out)
                         (to-phase? phase?))]
        ))

(define-syntax-rule (add-items parser [phase? [search-proc output-proc] ...] ...)
  ;: The general form for adding several items at once.
  ;: See the examples at the end of this page.
  (parameterize ([current-parser parser])
    (let ([p phase?]) ; externalize phase? from the inner "..."
      ; otherwise it gets counted
      (begin (add-item parser p search-proc output-proc) ...))
    ...))

(define (parse-text parser #:phase [phase ((parser-phase parser))] . text)
  ;:-> (listof any/c)
  ;: [text string?]
  ;: Parses $text with $parser, starting in phase $phase, which is the current phase
  ;: by default.
  ;:
  ;: It is thus possible to call the parser inside the parsing phase, i.e
  ;: once a portion of the text has been parsed, it can be given to the parser
  ;: itself in some phase to make further transformations.
  ;: This is not the same as sub-parsing because there is no callback.
  (parameterize ([current-parser parser]
                 [(parser-phase parser) phase]
                 [(parser-output parser) '()])
    (let ([line-number 0]
          [the-line ""])
      (with-handlers ([exn:fail? (λ(e)(printf "PARSE-ERROR: around line ~a~n~a~n" 
                                             (round (/ line-number 2))
                                             the-line)
                                   (raise e))])
        (for ([line (add-between (append-map (λ(line)(regexp-split "\n" line)) text) 
                                 "\n")])
          (++ line-number) ; for debug
          (set! the-line line) ; for debug
          (parse-line parser line))))
    (output-append parser ((parser-output parser)))
    ))

;[:section Matchers]
;: This section describes matching functions that can be used in the
;: $in argument of $add-item and $add-items.

; IN procs:
(define (re s) ;:-> procedure?
  ;: Turns $s into a pregexp and returns a procedure
  ;: that takes an input string and applies
  ;: $regexp-match-positions on that string with the pregexp $s.
  (λ(text)(regexp-match-positions (pregexp s) text)))
(define (txt s) ;:-> procedure?
  ;: Same as $re but regexp-quotes $s beforehand, so that the string $s
  ;: is matched exactly.
  (re (regexp-quote s)))
(define (kw s) ;:-> procedure?
  ;: Same as $txt but adds word-boundaries around $s.
  (re (string-append "\\b" (regexp-quote (to-string s)) "\\b")))

;[:section Actions]
;: This section describes action functions that can be used in the
;: $out argument of $add-item and $add-items.

(define (switch-phase phase) ;:-> string?
  ;: Sets the parser in the phase $phase and returns @scheme[""].
  ((parser-phase (current-parser)) phase)
  "")

(define (sub-parse new-phase [callback identity] 
                   #:appender [appender (parser-appender (current-parser))])
  ;:-> string?
  ;: [appender procedure?]
  ;: [callback procedure?]
  ;: Sets the current parser in sub-parse mode and switches to $new-phase.
  ;: The result of the sub-parse is appended with $appender, which by default
  ;: is the same as the parser's.
  ;: When the sub-parser has finished parsing
  ;: (it has returned with $sub-parse-return),
  ;: $callback is called with the result of the sub-parse and the result of
  ;: $callback is added to the current parser result.
  ;:
  ;: Sub-parsers can be called recursively, once in a sub-parsing mode
  ;: or in the $callback.
  ;:
  ;: Returns @scheme[""].
  (start-sub-parser (current-parser) new-phase callback appender)
  "")

(define (cons-out out) ;:-> void?
  ;: By default, the parser agglomerates the return values
  ;: of the action procedures.
  ;: The function $cons-out can be used to add a value to the parser
  ;: without being a return value of an action.
  ;: Should be rarely useful.
  (cons-output (current-parser) out))

(define (sub-parse-return [out #f])
  ;: Adds $out to the current parser result and returns
  ;: from the current sub-parsing mode.
  (when out (cons-out out))
  (terminate-sub-parser (current-parser))) ; this is also the return value!
  

(define-struct sub-parser
  (parser old-output old-phase new-phase callback appender)
  #:mutable)

(define (start-sub-parser parser new-phase callback appender)
  (let ([sub (make-sub-parser parser ((parser-output parser)) 
                              ((parser-phase parser)) new-phase callback
                              appender)])
    (push-sub-parser parser sub)
    ((parser-phase parser) new-phase)
    ((parser-output parser) '()) ; fresh-new-output
    ))

(define (terminate-sub-parser parser)
  (let* ([sub-parser (pop-sub-parser parser)]
         [output ((parser-output parser))])
;    (printf "Terminating sub-parser: ~a -> ~a ~n ~a ~n"
;            (sub-parser-new-phase sub-parser)
;            (sub-parser-old-phase sub-parser)
;            (parser-sub-parsers parser)
;            )
    ; restore previous settings
    ((parser-phase parser) (sub-parser-old-phase sub-parser))
    ((parser-output parser) (sub-parser-old-output sub-parser))
    ; call to the callback function with the accumulated local output:
    ((sub-parser-callback sub-parser) 
     (output-append parser output 
                    #:appender (sub-parser-appender sub-parser)))
    ))
  


(define (find-first-matcher parser text)
  (let ([matches
         (filter car 
                 (for/list ([item (parser-items parser)])
                   (let ([i-match ((item-search-proc item) text)])
                     (list (if i-match i-match #f) item))))])
    (if (empty? matches)
        (list (list (cons (string-length text) (string-length text))) #f)
        (argmin caaar matches)))) ; argmin stops on the first min, so priority to the last added items (cons)

(define (positions->strings str lpos)
  (map (λ(pos)(if pos (substring str (car pos) (cdr pos)) pos)) lpos))

(define (cons-output parser x)
  ((parser-output parser)
   (cons x ((parser-output parser)))))

(define (parse-line parser text)
  (let loop ([text text])
    (let* ([first-matcher (find-first-matcher parser text)]
           [start (caaar first-matcher)]
           [end (cdaar first-matcher)])
      (cons-output parser 
                   ((parser-no-match-proc parser)
                    (substring text 0 start)))
      (when (< start end) ; there is a match
        (cons-output parser
                     (apply (item-do-proc (second first-matcher))
                            (positions->strings text (car first-matcher)))))
      (let ([rest-str (substring text end)])
        (unless (string=? "" rest-str)
          (loop rest-str)))
      )))
  
(define (output-append parser output #:appender [appender (parser-appender parser)])
  (apply appender (reverse output)))


;[:section Examples]

;[:examples]
;(let ([p (new-parser)])
;  (add-items
;   p
;   ('start
;    ["pl(.[^p]?)p" (λ(s x)(string-append " -gl" x "tch- "))]
;    ["ou" "aï"]
;    [#t string-upcase]))
;  (parse-text p "youcoudouplipcoudouploup" "toupouchou"))
;
;(let ([tree-parser
;       (new-parser #:appender
;                   (λ vals (remove* '(||) vals)))])
;  (add-items
;   tree-parser
;   ('start
;    [#t string->symbol]
;    ["\\s+" '||]
;    ["\\(" (λ(s)(sub-parse 'start)'||)]
;    ["\\)" (λ(s)(sub-parse-return))]
;    ))
;  (parse-text
;   tree-parser
;   "tree:(root (node1 (leaf1 leaf2)
;leaf3) (node2
; leaf4 (node3 leaf5) leaf6) leaf7)"))
;[:end-examples]
;YaïCaïDaï -glitch- CaïDaï -gloutch-
;TaïPaïCHaï

;: Note that the result of the last example is Scheme data, not a string.