;;; @Package WebScraperHelper ;;; @Subtitle Simple Generation of SXPath Queries from SXML Examples ;;; @HomePage http://www.neilvandyke.org/webscraperhelper/ ;;; @Author Neil Van Dyke ;;; @Version 0.5 ;;; @Date 2009-03-14 ;;; @PLaneT neil/webscraperhelper:1:2 ;; $Id: webscraperhelper.ss,v 1.73 2009/03/14 08:56:37 neilpair Exp $ ;;; @legal ;;; Copyright @copyright{} 2004--2009 Neil Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 3 of the License (LGPL 3), or (at your option) ;;; any later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See ;;; @indicateurl{http://www.gnu.org/licenses/} for details. For other licenses ;;; and consulting, please contact the author. ;;; @end legal #lang scheme/base ;;; @section Introduction ;;; ;;; WebScraperHelper is intended as a programmer's aid for crafting ;;; @uref{http://pair.com/lisovsky/query/sxpath/, SXPath} queries to extract ;;; information (e.g., news items, prices) from HTML Web pages that have been ;;; parsed by @uref{http://www.neilvandyke.org/htmlprag/, HtmlPrag}. The ;;; current version of WebScraper accepts an example ;;; @uref{http://pobox.com/~oleg/ftp/Scheme/SXML.html, SXML} (or ;;; @uref{http://www.neilvandyke.org/shtml/, SHTML}) document and an example ;;; ``goal'' subtree of the document, and yields up to three different SXPath ;;; queries. A generated query can often be incorporated into a Web-scraping ;;; program as-is, for extracting information from documents with very similar ;;; formatting. Generated queries can also be used as starting points for ;;; hand-crafted queries. ;;; ;;; For example, given the SXML document @var{doc}: ;;; ;;; @smalllisp ;;; (define doc ;;; '(*TOP* (html (head (title "My Title")) ;;; (body (@@ (bgcolor "white")) ;;; (p "Summary: This is a document.") ;;; (div (@@ (id "ResultsSection")) ;;; (h2 "Results") ;;; (p "These are the results.") ;;; (table (@@ (id "ResultTable")) ;;; (tr (td (b "Input:")) ;;; (td "2 + 2")) ;;; (tr (td (b "Output:")) ;;; (td "Four"))) ;;; (p "Lookin' good!")))))) ;;; @end smalllisp ;;; ;;; @noindent ;;; evaluating the expression ;;; ;;; @lisp ;;; (webscraperhelper '(td "Four") doc) ;;; @end lisp ;;; ;;; @noindent ;;; will display generated queries like: ;;; ;;; @example ;;; Absolute SXPath: (html body div table (tr 2) (td 2)) ;;; Absolute SXPath with IDs: (html body ;;; (div (@@ (equal? (id "ResultsSection")))) ;;; (table (@@ (equal? (id "ResultTable")))) ;;; (tr 2) (td 2)) ;;; Relative SXPath with IDs: (// (table (@@ (equal? (id "ResultTable")))) ;;; (tr 2) (td 2)) ;;; @end example ;;; ;;; @noindent ;;; The queries can then be compiled with the @code{sxpath} procedure of the ;;; SXPath library: ;;; ;;; @lisp ;;; (define query ;;; (sxpath '(// (table (@@ (equal? (id "ResultTable")))) ;;; (tr 2) (td 2)))) ;;; ;;; (query doc) @result{} ((td "Four")) ;;; @end lisp ;;; ;;; This version of WebScraperHelper requires R5RS, SRFI-11, and SRFI-16. ;;; ;;; WebScraperHelper also comes with an advertising jingle (with apologies to ;;; greasy ground bovine additive Americana): ;;; ;;; @example ;;; WebScraperHelper ;;; helps a programmer ;;; scrape the ;;; Web a great deal! ;;; @end example ;; TODO: What happens to "id" attributes for SXML NF when they have to have a ;; value but they have no value in the HTML. If we set the value to "id", then ;; it's highly unlikely to be unique. Maybe this should be a feature of ;; HtmlPrag. (define (%elem-get-any-attr elem names) (if (or (not (list? elem)) (null? elem)) #f (let ((rest (cdr elem))) (cond ((not (list? rest)) #f) ((null? rest) #f) (else (let ((first-child (car rest))) (if (and (list? first-child) (not (null? first-child)) (eq? (car first-child) '@)) (let loop ((names names)) (if (null? names) #f (or (assq (car names) (cdr first-child)) (loop (cdr names))))) #f))))))) (define (%elem-content elem) (if (or (not (list? elem)) (null? elem)) '() (let loop ((lst (cdr elem))) (if (null? lst) '() (let ((head (car lst))) (if (and (list? head) (or (null? head) (memq (car head) '(@ @@)))) (loop (cdr lst)) lst)))))) (define (%step-name+num+elem step) (values (car step) (cadr step) (cddr step))) (define (%step-name+num step) (values (car step) (cadr step))) (define (%step-elem step) (cddr step)) (define (%scan-content goal path lst) (let loop-numbering ((lst lst) (steps '())) (if (null? lst) (webscraper-internal:process-steps goal path (let loop ((seen '()) (remaining steps) (result '())) (if (null? remaining) result (let ((step (car remaining))) (let ((name (car step)) (number (cadr step))) (cond ((memq name seen) (loop seen (cdr remaining) (cons step result))) ((equal? number 1) (loop seen (cdr remaining) (cons (cons name (cons #f (cddr step))) result))) (else (loop (cons name seen) (cdr remaining) (cons step result))))))))) (let ((head (car lst))) (cond ((null? head) (loop-numbering (cdr lst) steps)) ((list? head) (let ((name (car head))) (cond ((symbol? name) (loop-numbering (cdr lst) (cons (cons name (cons (let ((p (assq name steps))) (if p (+ 1 (cadr p)) 1)) head)) steps))) ((list? name) ;; TODO: This isn't quite right. (loop-numbering (append head (cdr lst)) steps)) (else (loop-numbering (cdr lst) steps))))) ;; TODO: We could match a string goal here. (else (loop-numbering (cdr lst) steps))))))) (define (webscraper-internal:process-steps goal path steps) (let loop ((steps steps)) (if (null? steps) #f (let* ((step (car steps))) (let ((elem (%step-elem step)) (new-path (cons step path))) (if (equal? elem goal) new-path (or (%scan-content goal new-path (%elem-content elem)) (loop (cdr steps))))))))) (define (%element-node? node) (and (list? node) (not (null? node)) (symbol? (car node)))) ;;; @section Interactive Interface ;;; In this version, the `interactive'' interface is a procedure intended to be ;;; invoked manually from a REPL. ;;; @defproc webscraperhelper goal sxml [ids] ;;; ;;; Displays some XPath queries yielding SXML @var{goal} from document ;;; @var{sxml}. ;;; ;;; @var{goal} is the desired SXML element node. ;;; ;;; @var{sxml} is the document in SXML First Normal Form (1NF). Some nested ;;; nodelists emitted by SXML transformation tools, such as attributes nested ;;; in extra list levels, are not permitted. ;;; ;;; The optional @var{ids} is a list of name symbols for element attributes ;;; that can be treate as unique identifiers. If @var{ids} is not given, then ;;; the default is @code{'(id)}. (Note: Since some Scheme implementations have ;;; case-insensitive readers, but SXML is case-sensitive, you may have to use ;;; @code{(list (string->symbol "foo") (string->symbol "bar"))} instead of ;;; @code{'(foo bar)}.) (define webscraperhelper (let ((default-ids (list (string->symbol "id")))) (case-lambda ((goal sxml ids) (let-values (((abs absids relids) (%wsh-sxpath-abs+absids+relids goal sxml ids))) (newline) (let loop ((lst `(("Absolute SXPath: " ,abs) ("Absolute SXPath with IDs: " ,absids) ("Relative SXPath with IDs: " ,relids))) (prior-value #f)) (and (not (null? lst)) (let ((label (caar lst)) (value (cadar lst))) (display label) (cond ((not value) (display "<None>")) ((equal? value prior-value) (display "<Same>")) (else (write value))) (newline) (loop (cdr lst) value)))) (newline))) ((goal sxml) (webscraperhelper goal sxml default-ids))))) ;;; @section Programmatic Interface ;;; The programmatic interface, such as it is, will likely change substantially ;;; in a future version, as new ways of generating queries are implemented. ;;; The following procedures are therefore exposed only for tinkering, and are ;;; not really documented. ;;; @defproc find-wsh-path goal sxml ;;; ;;; Yields a @dfn{wsh-path} to @var{goal} within @var{sxml}, or @code{#f} if no ;;; path could be found. The yielded path might share structure with ;;; @var{sxml}. (define (find-wsh-path goal sxml) (cond ((not (%element-node? goal)) (error "expected element node for goal")) ((not (%element-node? sxml)) (error "expected element node for SXML document")) (else (%scan-content goal '() sxml)))) (define (%numbered-sxpath-step name num) (if num (list name num) name)) ;;; @defproc wsh-path->sxpath-abs path ;;; @defprocx wsh-path->sxpath-absids+relids path ids ;;; @defprocx wsh-path->sxpath-abs+absids+relids path ids ;;; ;;; Translate a @dfn{wsh-path} to various SXPath queries. The yielded SXPath ;;; query lists should be considered immutable, as they might share structure ;;; with the original SXML from which @var{path} was generated, or multiple ;;; queries might share structure with each other. (define (wsh-path->sxpath-abs path) (let loop ((path path) (result '())) (if (null? path) result (loop (cdr path) (cons (let-values (((name num) (%step-name+num (car path)))) (%numbered-sxpath-step name num)) result))))) (define (wsh-path->sxpath-absids+relids path ids) (let loop ((path path) (absids '()) (relids #f)) (if (null? path) (values absids relids) (let-values (((name num elem) (%step-name+num+elem (car path)))) (let ((attr (%elem-get-any-attr elem ids))) (if attr ;; TODO: This barfs if attr is a list of one element. (let ((absids (cons `(,name (@ (equal? (,(car attr) ,(cadr attr))))) absids))) (loop (cdr path) absids (or relids (cons '// absids)))) (loop (cdr path) (cons (%numbered-sxpath-step name num) absids) relids))))))) (define (wsh-path->sxpath-abs+absids+relids path ids) (let-values (((absids relids) (wsh-path->sxpath-absids+relids path ids))) (values (wsh-path->sxpath-abs path) absids relids))) (define (%wsh-sxpath-abs+absids+relids goal sxml ids) (let ((path (find-wsh-path goal sxml))) (if path (wsh-path->sxpath-abs+absids+relids path ids) (values #f #f #f)))) ;; (define %elem-name+id ;; (let ((id-symbol (string->symbol "id"))) ;; (lambda (elem) ;; (if (or (not (list? elem)) (null? elem)) ;; (values #f #f) ;; (values (car elem) ;; (let ((rest (cdr elem))) ;; (cond ((not (list? rest)) #f) ;; ((null? rest) #f) ;; (else ;; (let ((first-child (car rest))) ;; (cond ((not (list? first-child)) #f) ;; ((null? first-child) #f) ;; ((not (eq? (car first-child) '@)) #f) ;; ((assq id-symbol (cdr first-child)) ;; => cdr) ;; (else #f))))))))))) ;; (define (%elem-name+id+content elem) ;; (if (or (not (list? elem)) (null? elem)) ;; (values #f #f '()) ;; (let ((name (car elem))) ;; (let ((rest (cdr elem))) ;; (if (or (not (list? rest)) (null? rest)) ;; (values name #f '()) ;; (let ((first-child (car rest))) ;; (if (and (list? first-child) ;; (not (null? first-child)) ;; (eq? (car first-child) '@)) ;; (values name ;; (let ((pair (assq ;; %id-symbol ;; (cdr first-child)))) ;; (if pair ;; ;; TODO: What if "cdr" isn't a list? ;; (cadr pair) ;; #f)) ;; (cdr rest)) ;; (values name #f rest)))))))) ;;; @unnumberedsec History ;;; ;;; @table @asis ;;; ;;; @item Version 0.5 --- 2009-03-14 --- PLaneT @code{(1 2)} ;;; Minor documentation change. ;;; ;;; @item Version 0.4 --- 2009-02-24 --- PLaneT @code{(1 1)} ;;; License now LGPL 3. Converted to author's new Scheme administration ;;; system. ;;; ;;; @item Version 0.3 --- 2005-07-04 --- PLaneT @code{(1 0)} ;;; Documentation update, plus get it into PLaneT 299/3xx. ;;; ;;; @item Version 0.2 --- 2004-08-16 ;;; Corrected typographical error in attributions. ;;; ;;; @item Version 0.1 --- 2004-07-31 ;;; Initial version. ;;; ;;; @end table (provide webscraperhelper)