import: cran: Parse DESCRIPTION instead of HTML.

* guix/import/cran.scm (description->alist, listify,
  beautify-description, description->package): New procedures.
(table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove
proceduces.
(latest-release): Use parsed DESCRIPTION instead of SXML.
* tests/cran.scm: Rewrite to match importer.
This commit is contained in:
Ricardo Wurmus 2015-12-03 16:12:09 +01:00
parent b6a222757b
commit 0f6b9e9828
2 changed files with 204 additions and 240 deletions

View File

@ -20,26 +20,26 @@
(define-module (guix import cran) (define-module (guix import cran)
#:use-module (ice-9 match) #:use-module (ice-9 match)
#:use-module (ice-9 regex) #:use-module (ice-9 regex)
#:use-module ((ice-9 rdelim) #:select (read-string))
#:use-module (srfi srfi-1) #:use-module (srfi srfi-1)
#:use-module (srfi srfi-26) #:use-module (srfi srfi-26)
#:use-module (sxml simple)
#:use-module (sxml match)
#:use-module (sxml xpath)
#:use-module (guix http-client) #:use-module (guix http-client)
#:use-module (guix hash) #:use-module (guix hash)
#:use-module (guix store) #:use-module (guix store)
#:use-module (guix base32) #:use-module (guix base32)
#:use-module ((guix download) #:select (download-to-store)) #:use-module ((guix download) #:select (download-to-store))
#:use-module (guix import utils) #:use-module (guix import utils)
#:use-module ((guix build-system r) #:select (cran-uri))
#:use-module (guix upstream) #:use-module (guix upstream)
#:use-module (guix packages) #:use-module (guix packages)
#:use-module (gnu packages)
#:export (cran->guix-package #:export (cran->guix-package
%cran-updater)) %cran-updater))
;;; Commentary: ;;; Commentary:
;;; ;;;
;;; Generate a package declaration template for the latest version of an R ;;; Generate a package declaration template for the latest version of an R
;;; package on CRAN, using the HTML description downloaded from ;;; package on CRAN, using the DESCRIPTION file downloaded from
;;; cran.r-project.org. ;;; cran.r-project.org.
;;; ;;;
;;; Code: ;;; Code:
@ -67,6 +67,31 @@
((lst ...) `(list ,@(map string->license lst))) ((lst ...) `(list ,@(map string->license lst)))
(_ #f))) (_ #f)))
(define (description->alist description)
"Convert a DESCRIPTION string into an alist."
(let ((lines (string-split description #\newline))
(parse (lambda (line acc)
(if (string-null? line) acc
;; Keys usually start with a capital letter and end with
;; ":". There are some exceptions, unfortunately (such
;; as "biocViews"). There are no blanks in a key.
(if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line)
;; New key/value pair
(let* ((pos (string-index line #\:))
(key (string-take line pos))
(value (string-drop line (+ 1 pos))))
(cons (cons key
(string-trim-both value))
acc))
;; This is a continuation of the previous pair
(match-let ((((key . value) . rest) acc))
(cons (cons key (string-join
(list value
(string-trim-both line))))
rest)))))))
(fold parse '() lines)))
(define (format-inputs names) (define (format-inputs names)
"Generate a sorted list of package inputs from a list of package NAMES." "Generate a sorted list of package inputs from a list of package NAMES."
(map (lambda (name) (map (lambda (name)
@ -82,99 +107,69 @@ package definition."
((package-inputs ...) ((package-inputs ...)
`((,type (,'quasiquote ,(format-inputs package-inputs))))))) `((,type (,'quasiquote ,(format-inputs package-inputs)))))))
(define (table-datum tree label)
"Extract the datum node following a LABEL in the sxml table TREE. Only the
first cell of a table row is considered a label cell."
((node-pos 1)
((sxpath `(xhtml:tr
(xhtml:td 1) ; only first cell can contain label
(equal? ,label)
,(node-parent tree) ; go up to label cell
,(node-parent tree) ; go up to matching row
(xhtml:td 2))) ; select second cell
tree)))
(define %cran-url "http://cran.r-project.org/web/packages/") (define %cran-url "http://cran.r-project.org/web/packages/")
(define (cran-fetch name) (define (cran-fetch name)
"Return an sxml representation of the CRAN page for the R package NAME, "Return an alist of the contents of the DESCRIPTION file for the R package
or #f on failure. NAME is case-sensitive." NAME, or #f on failure. NAME is case-sensitive."
;; This API always returns the latest release of the module. ;; This API always returns the latest release of the module.
(let ((cran-url (string-append %cran-url name "/"))) (let ((url (string-append %cran-url name "/DESCRIPTION")))
(false-if-exception (description->alist (read-string (http-fetch url)))))
(xml->sxml (http-fetch cran-url)
#:trim-whitespace? #t
#:namespaces '((xhtml . "http://www.w3.org/1999/xhtml"))
#:default-entity-handler
(lambda (port name)
(case name
((nbsp) " ")
((ge) ">=")
((gt) ">")
((lt) "<")
(else
(format (current-warning-port)
"~a:~a:~a: undefined entitity: ~a\n"
cran-url (port-line port) (port-column port)
name)
(symbol->string name))))))))
(define (downloads->url downloads) (define (listify meta field)
"Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the "Look up FIELD in the alist META. If FIELD contains a comma-separated
download URL." string, turn it into a list and strip off parenthetic expressions. Return the
(string-append "mirror://cran/" empty list when the FIELD cannot be found."
;; Remove double dots, because we want an (let ((value (assoc-ref meta field)))
;; absolute path. (if (not value)
(regexp-substitute/global '()
#f "\\.\\./" ;; Strip off parentheses
(string-join ((sxpath '((xhtml:a 1) @ href *text*)) (let ((items (string-split (regexp-substitute/global
(table-datum downloads " Package source: "))) #f "( *\\([^\\)]+\\)) *"
'pre 'post))) value 'pre 'post)
#\,)))
;; When there is whitespace inside of items it is probably because
;; this was not an actual list to begin with.
(remove (cut string-any char-set:whitespace <>)
(map string-trim-both items))))))
(define (nodes->text nodeset) (define (beautify-description description)
"Return the concatenation of the text nodes among NODESET." "Improve the package DESCRIPTION by turning a beginning sentence fragment
(string-join ((sxpath '(// *text*)) nodeset) " ")) into a proper sentence and by using two spaces between sentences."
(let ((cleaned (if (string-prefix? "A " description)
(string-append "This package provides a"
(substring description 1))
description)))
;; Use double spacing between sentences
(regexp-substitute/global #f "\\. \\b"
cleaned 'pre ". " 'post)))
(define (cran-sxml->sexp sxml) (define (description->package meta)
"Return the `package' s-expression for a CRAN package from the SXML "Return the `package' s-expression for a CRAN package from the alist META,
representation of the package page." which was derived from the R package's DESCRIPTION file."
(define (guix-name name) (define (guix-name name)
(if (string-prefix? "r-" name) (if (string-prefix? "r-" name)
(string-downcase name) (string-downcase name)
(string-append "r-" (string-downcase name)))) (string-append "r-" (string-downcase name))))
(sxml-match-let* (let* ((name (assoc-ref meta "Package"))
(((*TOP* (xhtml:html (synopsis (assoc-ref meta "Title"))
,head (version (assoc-ref meta "Version"))
(xhtml:body (license (string->license (assoc-ref meta "License")))
(xhtml:h2 ,name-and-synopsis) ;; Some packages have multiple home pages. Some have none.
(xhtml:p ,description) (home-page (match (listify meta "URL")
,summary ((url rest ...) url)
(xhtml:h4 "Downloads:") ,downloads (_ (string-append %cran-url name))))
. ,rest))) (source-url (match (cran-uri name version)
sxml)) ((url rest ...) url)
(let* ((name (match:prefix (string-match ": " name-and-synopsis))) (_ #f)))
(synopsis (match:suffix (string-match ": " name-and-synopsis)))
(version (nodes->text (table-datum summary "Version:")))
(license ((compose string->license nodes->text)
(table-datum summary "License:")))
(home-page (nodes->text ((sxpath '((xhtml:a 1)))
(table-datum summary "URL:"))))
(source-url (downloads->url downloads))
(tarball (with-store store (download-to-store store source-url))) (tarball (with-store store (download-to-store store source-url)))
(sysdepends (map match:substring (sysdepends (map string-downcase (listify meta "SystemRequirements")))
(list-matches (propagate (map guix-name (lset-union equal?
"[^ ]+" (listify meta "Imports")
;; Strip off comma and parenthetical (listify meta "LinkingTo")
;; expressions. (delete "R"
(regexp-substitute/global (listify meta "Depends"))))))
#f "(,|\\([^\\)]+\\))"
(nodes->text (table-datum summary
"SystemRequirements:"))
'pre 'post))))
(imports (map guix-name
((sxpath '(// xhtml:a *text*))
(table-datum summary "Imports:")))))
`(package `(package
(name ,(guix-name name)) (name ,(guix-name name))
(version ,version) (version ,version)
@ -184,23 +179,22 @@ representation of the package page."
(sha256 (sha256
(base32 (base32
,(bytevector->nix-base32-string (file-sha256 tarball)))))) ,(bytevector->nix-base32-string (file-sha256 tarball))))))
(properties ,`(,'quasiquote ((,'upstream-name . ,name))))
(build-system r-build-system) (build-system r-build-system)
,@(maybe-inputs sysdepends) ,@(maybe-inputs sysdepends)
,@(maybe-inputs imports 'propagated-inputs) ,@(maybe-inputs propagate 'propagated-inputs)
(home-page ,(if (string-null? home-page) (home-page ,(if (string-null? home-page)
(string-append %cran-url name) (string-append %cran-url name)
home-page)) home-page))
(synopsis ,synopsis) (synopsis ,synopsis)
;; Use double spacing (description ,(beautify-description (assoc-ref meta "Description")))
(description ,(regexp-substitute/global #f "\\. \\b" description (license ,license))))
'pre ". " 'post))
(license ,license)))))
(define (cran->guix-package package-name) (define (cran->guix-package package-name)
"Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the "Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the
`package' s-expression corresponding to that package, or #f on failure." `package' s-expression corresponding to that package, or #f on failure."
(let ((module-meta (cran-fetch package-name))) (let ((module-meta (cran-fetch package-name)))
(and=> module-meta cran-sxml->sexp))) (and=> module-meta description->package)))
;;; ;;;
@ -209,32 +203,33 @@ representation of the package page."
(define (latest-release package) (define (latest-release package)
"Return an <upstream-source> for the latest release of PACKAGE." "Return an <upstream-source> for the latest release of PACKAGE."
(define name
(if (string-prefix? "r-" package)
(string-drop package 2)
package))
(define sxml (define (package->cran-name package)
(cran-fetch name)) (match (package-source package)
((? origin? origin)
(match (origin-uri origin)
((url rest ...)
(let ((end (string-rindex url #\_))
(start (string-rindex url #\/)))
;; The URL ends on
;; (string-append "/" name "_" version ".tar.gz")
(substring url start end)))
(_ #f)))
(_ #f)))
(and sxml (define cran-name
(sxml-match-let* (package->cran-name (specification->package package)))
(((*TOP* (xhtml:html
,head (define meta
(xhtml:body (cran-fetch cran-name))
(xhtml:h2 ,name-and-synopsis)
(xhtml:p ,description) (and meta
,summary (let ((version (assoc-ref meta "Version")))
(xhtml:h4 "Downloads:") ,downloads
. ,rest)))
sxml))
(let ((version (nodes->text (table-datum summary "Version:")))
(url (downloads->url downloads)))
;; CRAN does not provide signatures. ;; CRAN does not provide signatures.
(upstream-source (upstream-source
(package package) (package package)
(version version) (version version)
(urls (list url))))))) (urls (cran-uri cran-name version))))))
(define (cran-package? package) (define (cran-package? package)
"Return true if PACKAGE is an R package from CRAN." "Return true if PACKAGE is an R package from CRAN."

View File

@ -19,120 +19,84 @@
(define-module (test-cran) (define-module (test-cran)
#:use-module (guix import cran) #:use-module (guix import cran)
#:use-module (guix tests) #:use-module (guix tests)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-64) #:use-module (srfi srfi-64)
#:use-module (srfi srfi-26)
#:use-module (ice-9 match)) #:use-module (ice-9 match))
(define sxml (define description "
'(*TOP* (xhtml:html Package: My-Example
(xhtml:head Type: Package
(xhtml:title "CRAN - Package my-example-sxml")) Title: Example package
(xhtml:body Version: 1.2.3
(xhtml:h2 "my-example-sxml: Short description") Date: 2015-12-10
(xhtml:p "Long description") Author: Ricardo Wurmus
(xhtml:table Maintainer: Guix Schmeeks <guix@gnu.org>
(@ (summary "Package my-example-sxml summary")) URL: http://gnu.org/s/my-example
(xhtml:tr Description: This is a long description
(xhtml:td "Version:") spanning multiple lines: and it could confuse the parser that
(xhtml:td "1.2.3")) there is a colon : on the lines.
(xhtml:tr And: this line continues the description.
(xhtml:td "Depends:") biocViews: 0
(xhtml:td "R (>= 3.1.0)")) SystemRequirements: Cairo (>= 0)
(xhtml:tr Depends: A C++11 compiler. Version 4.6.* of g++ (as
(xhtml:td "SystemRequirements:") currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or
(xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)")) later will be fine.
(xhtml:tr License: GPL (>= 3)
(xhtml:td "Imports:") Imports: Rcpp (>= 0.11.5), proto, Scales
(xhtml:td LinkingTo: Rcpp, BH
(xhtml:a (@ (href "../scales/index.html")) NeedsCompilation: yes
"scales") Repository: CRAN
" (>= 0.2.3), " Date/Publication: 2015-07-14 14:15:16
(xhtml:a (@ (href "../proto/index.html")) ")
"proto")
", "
(xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp")
" (>= 0.11.0)"))
(xhtml:tr
(xhtml:td "Suggests:")
(xhtml:td
(xhtml:a (@ (href "../some/index.html"))
"some")
", "
(xhtml:a (@ (href "../suggestions/index.html"))
"suggestions")))
(xhtml:tr
(xhtml:td "License:")
(xhtml:td
(xhtml:a (@ (href "../../licenses/MIT")) "MIT")))
(xhtml:tr
(xhtml:td "URL:")
(xhtml:td
(xhtml:a (@ (href "http://gnu.org/s/my-example-sxml"))
"http://gnu.org/s/my-example-sxml")
", "
(xhtml:a (@ (href "http://alternative/home/page"))
"http://alternative/home/page"))))
(xhtml:h4 "Downloads:")
(xhtml:table
(@ (summary "Package my-example-sxml downloads"))
(xhtml:tr
(xhtml:td " Reference manual: ")
(xhtml:td
(xhtml:a (@ (href "my-example-sxml.pdf"))
" my-example-sxml.pdf ")))
(xhtml:tr
(xhtml:td " Package source: ")
(xhtml:td
(xhtml:a
(@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz"))
" my-example-sxml_1.2.3.tar.gz "))))
(xhtml:h4 "Reverse dependencies:")
(xhtml:table
(@ (summary "Package my-example-sxml reverse dependencies"))
(xhtml:tr
(xhtml:td "Reverse depends:")
(xhtml:td "Too many."))
(xhtml:tr
(xhtml:td "Reverse imports:")
(xhtml:td "Likewise."))
(xhtml:tr
(xhtml:td "Reverse suggests:")
(xhtml:td "Uncountable.")))))))
(define simple-table (define description-alist
'(xhtml:table ((@@ (guix import cran) description->alist) description))
(xhtml:tr
(xhtml:td "Numbers") (define simple-alist
(xhtml:td "123")) '(("Key" . "Value")
(xhtml:tr ("SimpleList" . "R, Rcpp, something, whatever")
(@ (class "whatever")) ("BadList" . "This is not a real list, you know?")
(xhtml:td (@ (class "unimportant")) "Letters") ("List" . "R (>= 2.2), BH (for no reason), GenomicRanges")))
(xhtml:td "abc"))
(xhtml:tr
(xhtml:td "Letters")
(xhtml:td "xyz"))
(xhtml:tr
(xhtml:td "Single"))
(xhtml:tr
(xhtml:td "not a value")
(xhtml:td "not a label")
(xhtml:td "also not a label"))))
(test-begin "cran") (test-begin "cran")
(test-equal "table-datum: return list of first table cell matching label" (test-assert "description->alist: contains all valid keys"
'((xhtml:td "abc")) (let ((keys '("Package" "Type" "Title" "Version" "Date"
((@@ (guix import cran) table-datum) simple-table "Letters")) "Author" "Maintainer" "URL" "Description"
"SystemRequirements" "Depends" "License"
"Imports" "biocViews" "LinkingTo"
"NeedsCompilation" "Repository"
"Date/Publication")))
(lset= string=? keys (map car description-alist))))
(test-equal "table-datum: return empty list if no match" (test-equal "listify: return empty list if key cannot be found"
'() '()
((@@ (guix import cran) table-datum) simple-table "Astronauts")) ((@@ (guix import cran) listify) simple-alist "Letters"))
(test-equal "table-datum: only consider the first cell as a label cell" (test-equal "listify: split comma-separated value into elements"
'("R" "Rcpp" "something" "whatever")
((@@ (guix import cran) listify) simple-alist "SimpleList"))
(test-equal "listify: strip off parentheses"
'("R" "BH" "GenomicRanges")
((@@ (guix import cran) listify) simple-alist "List"))
(test-equal "listify: ignore values that are no lists"
'() '()
((@@ (guix import cran) table-datum) simple-table "not a label")) ((@@ (guix import cran) listify) simple-alist "BadList"))
(test-equal "beautify-description: use double spacing"
"This is a package. It is great. Trust me Mr. Hendrix."
((@@ (guix import cran) beautify-description)
"This is a package. It is great. Trust me Mr. Hendrix."))
(test-assert "cran-sxml->sexp" (test-equal "beautify-description: transform fragment into sentence"
"This package provides a function to establish world peace"
((@@ (guix import cran) beautify-description)
"A function to establish world peace"))
(test-assert "description->package"
;; Replace network resources with sample data. ;; Replace network resources with sample data.
(mock ((guix build download) url-fetch (mock ((guix build download) url-fetch
(lambda* (url file-name #:key (mirrors '())) (lambda* (url file-name #:key (mirrors '()))
@ -140,32 +104,37 @@
(lambda () (lambda ()
(display (display
(match url (match url
("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz" ("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz"
"source") "source")
(_ (error "Unexpected URL: " url)))))))) (_ (error "Unexpected URL: " url))))))))
(match ((@@ (guix import cran) cran-sxml->sexp) sxml) (match ((@@ (guix import cran) description->package) description-alist)
(('package (('package
('name "r-my-example-sxml") ('name "r-my-example")
('version "1.2.3") ('version "1.2.3")
('source ('origin ('source ('origin
('method 'url-fetch) ('method 'url-fetch)
('uri ('cran-uri "my-example-sxml" 'version)) ('uri ('cran-uri "My-Example" 'version))
('sha256 ('sha256
('base32 ('base32
(? string? hash))))) (? string? hash)))))
('properties ('quasiquote (('upstream-name . "My-Example"))))
('build-system 'r-build-system) ('build-system 'r-build-system)
('inputs ('inputs
('quasiquote ('quasiquote
(("cairo" ('unquote 'cairo))))) (("cairo" ('unquote 'cairo)))))
('propagated-inputs ('propagated-inputs
('quasiquote ('quasiquote
(("r-proto" ('unquote 'r-proto)) (("r-bh" ('unquote 'r-bh))
("r-proto" ('unquote 'r-proto))
("r-rcpp" ('unquote 'r-rcpp)) ("r-rcpp" ('unquote 'r-rcpp))
("r-scales" ('unquote 'r-scales))))) ("r-scales" ('unquote 'r-scales)))))
('home-page "http://gnu.org/s/my-example-sxml") ('home-page "http://gnu.org/s/my-example")
('synopsis "Short description") ('synopsis "Example package")
('description "Long description") ('description
('license 'x11))) "This is a long description spanning multiple lines: \
and it could confuse the parser that there is a colon : on the \
lines. And: this line continues the description.")
('license 'gpl3+)))
(x (x
(begin (begin
(format #t "~s\n" x) (format #t "~s\n" x)