;;; markup - hypertext in scheme, producing latex and html
;;; Copyright (C) 1995  Scott Draves <spot@cs.cmu.edu>
;;;
;;; This program is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 2 of the License, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program; if not, write to the Free Software
;;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

; quick and dirty html parser, including a converter to .mar.  written
; just so i don't have to translate my proposal by hand, but it could
; be useful.  -spot

; biggest bugs: fails if `<' appears in a <pre> block.  doesn't handle
; tabs in input.

;;; (read-to term-char?) returns a thunk.  When called, the thunk
;;; consumes characters from the current input port until the next
;;; character satisfies TERM-CHAR? or the input is exhausted, and
;;; returns the consumed characters as a string.  The terminating
;;; character is left unread.  TERM-CHAR? is never applied to the eof
;;; object.
(define read-to
  (lambda (term-char?)
    (lambda ()
      (let loop ((next (peek-char))
                 (buf '()))
        ;; eof ends the run too; otherwise a predicate such as
        ;; char-whitespace? would be handed a non-character
        (cond ((or (eof-object? next) (term-char? next))
               (list->string (reverse buf)))
              (else (let ((b (cons (read-char) buf)))
                      (loop (peek-char) b))))))))

;;; Read the inside of a tag, up to and including the closing `>',
;;; from the current input port (the opening `<' has already been
;;; consumed by the caller).  Returns a list with one entry per word
;;; in the tag, in input order: (key) for a bare word, (key . val) for
;;; key=val.  Signals an error if the input ends before the `>'.
(define read-key-list
  (lambda ()
    (let loop ((key-list '()))
      (skip-whitespace)
      (let ((next (peek-char)))
        (cond ((eof-object? next)
               ;; without this check an unterminated tag would keep
               ;; producing empty keys forever
               (error "unexpected eof in tag ~S" (reverse key-list)))
              ((eq? #\> next)
               (read-char)
               (reverse key-list))
              (else
               (let ((key (read-sym)))
                 (if (eq? #\= (peek-char))
                     (begin (read-char)
                            (loop (cons (cons key (read-val)) key-list)))
                     (loop (cons (list key) key-list))))))))))

;;; Thunk that reads characters up to (not including) the next
;;; white-space character, `>' or `='.
(define read-to-sym-sep
  (let ((sym-sep? (lambda (ch)
                    (cond ((char-whitespace? ch) #t)
                          ((eq? ch #\>) #t)
                          (else (eq? ch #\=))))))
    (read-to sym-sep?)))

;;; Return a fresh string equal to S with every character replaced by
;;; its lower-case counterpart.  Uses the standard char-downcase
;;; rather than arithmetic on ASCII codes.
(define (downcase s)
  (list->string (map char-downcase
                     (string->list s))))

;;; Read one word (see read-to-sym-sep) from the current input port
;;; and return it as a lower-cased symbol.
(define (read-sym)
  (let* ((raw (read-to-sym-sep))
         (lowered (downcase raw)))
    (string->symbol lowered)))

;;; Read the value part of key=value.  A value starting with a double
;;; quote is read with the scheme reader; anything else is read as a
;;; symbol via read-sym.
(define (read-val)
  (cond ((eq? (peek-char) #\") (read))
        (else (read-sym))))

;;; Thunk that consumes the run of white-space characters at the front
;;; of the current input port (the consumed text is returned as a
;;; string).
(define skip-whitespace
  (let ((non-blank? (lambda (ch)
                      (not (char-whitespace? ch)))))
    (read-to non-blank?)))

;;; Thunk that reads characters up to (not including) the next
;;; white-space character or `<'.
(define read-to-white-or-lt
  (let ((white-or-lt? (lambda (ch)
                        (if (char-whitespace? ch)
                            #t
                            (eq? #\< ch)))))
    (read-to white-or-lt?)))

;;; Return a thunk (built with read-to) that reads up to, but not
;;; including, the next occurrence of the character C.
(define read-to-char
  (lambda (c)
    (let ((at-c? (lambda (d) (eq? c d))))
      (read-to at-c?))))



; reads one token from the current input port and returns it: the eof
; object, a single white-space character, a string of text, or (for a
; tag) a list shaped like ((name) (key . val) ...), where a key that
; has no value appears as (key)

(define (read-html-token)
  (let ((ch (peek-char)))
    (cond ((eof-object? ch) ch)
          ((char-whitespace? ch) (read-char))
          ((eq? ch #\<)
           (read-char)                  ; drop the `<' itself
           (read-key-list))
          ;; splitting text into words isn't necessary but could come
          ;; in handy
          (else (read-to-white-or-lt)))))

;;; Read one token from the current input port.  If it is the opening
;;; tag of one of the balanced elements listed below, also read
;;; everything up to the matching `/name' tag and return the tag with
;;; a (block . contents) entry inserted right after its (name) entry;
;;; nested balanced elements inside the contents are treated the same
;;; way.  Any other token is returned unchanged.  Signals an error if
;;; the input ends inside a balanced element.
(define (read-balanced-html)

  ;; tags that must be closed by a matching `/name' tag
  (define balanced-names
    '(code em a pre ol dl ul h1 h2 h3 h4 h5 h6 title))

  (define (opens-block? name)
    (memq name balanced-names))

  ;; code -> /code
  (define (closer-for name)
    (string->symbol (string-append "/" (symbol->string name))))

  ;; read the contents of the element opened by TAG and splice them in
  ;; as (block . contents)
  (define (attach-block tag)
    (let ((contents (collect-until (closer-for (token-name tag)))))
      (cons (car tag)
            (cons (cons 'block contents)
                  (cdr tag)))))

  ;; gather tokens, in input order, until the tag named CLOSER shows up
  (define (collect-until closer)
    (let gather ((seen '()))
      (let ((tok (read-html-token)))
        (cond ((eof-object? tok)
               (error "unexpected eof ~S" closer))
              ((not (pair? tok))
               (gather (cons tok seen)))
              ((opens-block? (token-name tok))
               (gather (cons (attach-block tok) seen)))
              ((eq? closer (token-name tok))
               (reverse seen))
              (else (gather (cons tok seen)))))))

  (let ((first (read-html-token)))
    (if (and (pair? first)
             (opens-block? (token-name first)))
        (attach-block first)
        first)))

;;; Read one balanced html token.  With no argument the token is read
;;; via read-balanced-html directly; with a port argument the read is
;;; wrapped in with-stdin on that port (with-stdin is defined
;;; elsewhere -- presumably it redirects the current input; confirm).
(define (html-reader . port)
  (cond ((pair? port)
         (with-stdin (car port)
                     (read-balanced-html)))
        (else (read-balanced-html))))

;;; Open FILE, read one balanced html token from it with html-reader,
;;; close the file again and return the token.
(define (read-html-file file)
  (let* ((port (open-input-file file))
         (result (with-stdin port
                             (html-reader))))
    ;; the port was previously left open after the read
    (close-input-port port)
    result))

;;; The name of a tag token is the key of its first entry.
(define (token-name token)
  (car (car token)))

;;; Return the contents stored under the `block' key of TOKEN, or
;;; signal an error if TOKEN has no such entry.
(define (token-block token)
  (let ((entry (assq 'block token)))
    (if entry
        (cdr entry)
        (error "block missing: ~S" token))))

;;; Write the markup for one html TOKEN to the current output.
;;; FRESH-LINE is true when the previous token left the output at the
;;; start of a line; the return value is the new fresh-line state
;;; (#t after a newline token, #f after anything else).
;;;
;;; TOKEN is a white-space character, a string, or a tag list as
;;; produced by the reader.  Anything else signals an error.
(define (html-token->markup token fresh-line)
  ;; emit "{LATEX-NAME <contents of the token's block>}"
  (define (easy latex-name)
    (format #t "{~A " latex-name)
    (html->markup (token-block token))
    (write-string "}"))
  (cond ((eq? token #\newline) (if (not fresh-line)
                                   (newline))
                               #t)
        ;; every other white-space character (space, tab, ...) is
        ;; copied through unchanged; the reader hands all of them over,
        ;; so rejecting tabs as bad tokens was a bug
        ((and (char? token) (char-whitespace? token))
         (write-char token)
         #f)
        ((string? token) (write-string token) #f)
        ((pair? token)
         (case (token-name token)
           ;; an anchor with an href becomes a link, otherwise only its
           ;; contents are written
           ((a) (cond ((assq 'href token)
                       => (lambda (p)
                            (format #t "{link ~A " (cdr p))
                            (html->markup (token-block token))
                            (write-string "}")))
                      (else (html->markup (token-block token)))))
           ((em) (easy "em"))
           ((p hr) (newline))
           ((li) (write-string " o "))
           ;; dt opens a brace that the following dd closes
           ((dt) (write-string " o {"))
           ((dd) (write-string "}"))
           ((pre) (easy "code"))
           ((code) (easy "c"))
           ((ol) (easy "enumerate"))
           ((ul) (easy "itemize"))
           ((dl) (easy "definitions"))
           ((title) (warn "title ignored"))
           ;; all heading levels map to the same section form
           ((h1 h2 h3 h4 h5 h6)
            (write-string "{section xxx ")
            (html->markup (token-block token))
            (write-string "}"))
           (else (warn "unknown html name ~S"
                       (token-name token))))
         #f)
        (else (error "bad html token ~S" token))))

;;; too many varargs
;;; Fold OP over LIST, threading any number of seed values.  For each
;;; element OP is called with the element followed by the current
;;; seeds, and the values it returns become the seeds for the next
;;; element.  The final seeds are returned as a single list.
(define reduce-list
  (lambda (op list . seeds)
    (let fold ((items list)
               (acc seeds))
      (if (null? items)
          (values acc)
          (call-with-values
              (lambda () (apply op (car items) acc))
            (lambda results
              (fold (cdr items) results)))))))

;;; Write the markup for every token in TOKEN-LIST, in order, passing
;;; the fresh-line state from one token to the next (it starts out
;;; false).
(define html->markup
  (lambda (token-list)
    (reduce-list html-token->markup
                 token-list
                 #f)))
