import: cran: Parse DESCRIPTION instead of HTML.

* guix/import/cran.scm (description->alist, listify,
  beautify-description, description->package): New procedures.
(table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove
proceduces.
(latest-release): Use parsed DESCRIPTION instead of SXML.
* tests/cran.scm: Rewrite to match importer.
This commit is contained in:
Ricardo Wurmus 2015-12-03 16:12:09 +01:00
parent b6a222757b
commit 0f6b9e9828
2 changed files with 204 additions and 240 deletions

View File

@ -20,26 +20,26 @@
(define-module (guix import cran)
#:use-module (ice-9 match)
#:use-module (ice-9 regex)
#:use-module ((ice-9 rdelim) #:select (read-string))
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (sxml simple)
#:use-module (sxml match)
#:use-module (sxml xpath)
#:use-module (guix http-client)
#:use-module (guix hash)
#:use-module (guix store)
#:use-module (guix base32)
#:use-module ((guix download) #:select (download-to-store))
#:use-module (guix import utils)
#:use-module ((guix build-system r) #:select (cran-uri))
#:use-module (guix upstream)
#:use-module (guix packages)
#:use-module (gnu packages)
#:export (cran->guix-package
%cran-updater))
;;; Commentary:
;;;
;;; Generate a package declaration template for the latest version of an R
;;; package on CRAN, using the HTML description downloaded from
;;; package on CRAN, using the DESCRIPTION file downloaded from
;;; cran.r-project.org.
;;;
;;; Code:
@ -67,6 +67,31 @@
((lst ...) `(list ,@(map string->license lst)))
(_ #f)))
(define (description->alist description)
"Convert a DESCRIPTION string into an alist."
(let ((lines (string-split description #\newline))
(parse (lambda (line acc)
(if (string-null? line) acc
;; Keys usually start with a capital letter and end with
;; ":". There are some exceptions, unfortunately (such
;; as "biocViews"). There are no blanks in a key.
(if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line)
;; New key/value pair
(let* ((pos (string-index line #\:))
(key (string-take line pos))
(value (string-drop line (+ 1 pos))))
(cons (cons key
(string-trim-both value))
acc))
;; This is a continuation of the previous pair
(match-let ((((key . value) . rest) acc))
(cons (cons key (string-join
(list value
(string-trim-both line))))
rest)))))))
(fold parse '() lines)))
(define (format-inputs names)
"Generate a sorted list of package inputs from a list of package NAMES."
(map (lambda (name)
@ -82,99 +107,69 @@ package definition."
((package-inputs ...)
`((,type (,'quasiquote ,(format-inputs package-inputs)))))))
(define (table-datum tree label)
"Extract the datum node following a LABEL in the sxml table TREE. Only the
first cell of a table row is considered a label cell."
((node-pos 1)
((sxpath `(xhtml:tr
(xhtml:td 1) ; only first cell can contain label
(equal? ,label)
,(node-parent tree) ; go up to label cell
,(node-parent tree) ; go up to matching row
(xhtml:td 2))) ; select second cell
tree)))
(define %cran-url "http://cran.r-project.org/web/packages/")
(define (cran-fetch name)
"Return an sxml representation of the CRAN page for the R package NAME,
or #f on failure. NAME is case-sensitive."
"Return an alist of the contents of the DESCRIPTION file for the R package
NAME, or #f on failure. NAME is case-sensitive."
;; This API always returns the latest release of the module.
(let ((cran-url (string-append %cran-url name "/")))
(false-if-exception
(xml->sxml (http-fetch cran-url)
#:trim-whitespace? #t
#:namespaces '((xhtml . "http://www.w3.org/1999/xhtml"))
#:default-entity-handler
(lambda (port name)
(case name
((nbsp) " ")
((ge) ">=")
((gt) ">")
((lt) "<")
(else
(format (current-warning-port)
"~a:~a:~a: undefined entitity: ~a\n"
cran-url (port-line port) (port-column port)
name)
(symbol->string name))))))))
(let ((url (string-append %cran-url name "/DESCRIPTION")))
(description->alist (read-string (http-fetch url)))))
(define (downloads->url downloads)
"Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the
download URL."
(string-append "mirror://cran/"
;; Remove double dots, because we want an
;; absolute path.
(regexp-substitute/global
#f "\\.\\./"
(string-join ((sxpath '((xhtml:a 1) @ href *text*))
(table-datum downloads " Package source: ")))
'pre 'post)))
(define (listify meta field)
"Look up FIELD in the alist META. If FIELD contains a comma-separated
string, turn it into a list and strip off parenthetic expressions. Return the
empty list when the FIELD cannot be found."
(let ((value (assoc-ref meta field)))
(if (not value)
'()
;; Strip off parentheses
(let ((items (string-split (regexp-substitute/global
#f "( *\\([^\\)]+\\)) *"
value 'pre 'post)
#\,)))
;; When there is whitespace inside of items it is probably because
;; this was not an actual list to begin with.
(remove (cut string-any char-set:whitespace <>)
(map string-trim-both items))))))
(define (nodes->text nodeset)
"Return the concatenation of the text nodes among NODESET."
(string-join ((sxpath '(// *text*)) nodeset) " "))
(define (beautify-description description)
"Improve the package DESCRIPTION by turning a beginning sentence fragment
into a proper sentence and by using two spaces between sentences."
(let ((cleaned (if (string-prefix? "A " description)
(string-append "This package provides a"
(substring description 1))
description)))
;; Use double spacing between sentences
(regexp-substitute/global #f "\\. \\b"
cleaned 'pre ". " 'post)))
(define (cran-sxml->sexp sxml)
"Return the `package' s-expression for a CRAN package from the SXML
representation of the package page."
(define (description->package meta)
"Return the `package' s-expression for a CRAN package from the alist META,
which was derived from the R package's DESCRIPTION file."
(define (guix-name name)
(if (string-prefix? "r-" name)
(string-downcase name)
(string-append "r-" (string-downcase name))))
(sxml-match-let*
(((*TOP* (xhtml:html
,head
(xhtml:body
(xhtml:h2 ,name-and-synopsis)
(xhtml:p ,description)
,summary
(xhtml:h4 "Downloads:") ,downloads
. ,rest)))
sxml))
(let* ((name (match:prefix (string-match ": " name-and-synopsis)))
(synopsis (match:suffix (string-match ": " name-and-synopsis)))
(version (nodes->text (table-datum summary "Version:")))
(license ((compose string->license nodes->text)
(table-datum summary "License:")))
(home-page (nodes->text ((sxpath '((xhtml:a 1)))
(table-datum summary "URL:"))))
(source-url (downloads->url downloads))
(let* ((name (assoc-ref meta "Package"))
(synopsis (assoc-ref meta "Title"))
(version (assoc-ref meta "Version"))
(license (string->license (assoc-ref meta "License")))
;; Some packages have multiple home pages. Some have none.
(home-page (match (listify meta "URL")
((url rest ...) url)
(_ (string-append %cran-url name))))
(source-url (match (cran-uri name version)
((url rest ...) url)
(_ #f)))
(tarball (with-store store (download-to-store store source-url)))
(sysdepends (map match:substring
(list-matches
"[^ ]+"
;; Strip off comma and parenthetical
;; expressions.
(regexp-substitute/global
#f "(,|\\([^\\)]+\\))"
(nodes->text (table-datum summary
"SystemRequirements:"))
'pre 'post))))
(imports (map guix-name
((sxpath '(// xhtml:a *text*))
(table-datum summary "Imports:")))))
(sysdepends (map string-downcase (listify meta "SystemRequirements")))
(propagate (map guix-name (lset-union equal?
(listify meta "Imports")
(listify meta "LinkingTo")
(delete "R"
(listify meta "Depends"))))))
`(package
(name ,(guix-name name))
(version ,version)
@ -184,23 +179,22 @@ representation of the package page."
(sha256
(base32
,(bytevector->nix-base32-string (file-sha256 tarball))))))
(properties ,`(,'quasiquote ((,'upstream-name . ,name))))
(build-system r-build-system)
,@(maybe-inputs sysdepends)
,@(maybe-inputs imports 'propagated-inputs)
,@(maybe-inputs propagate 'propagated-inputs)
(home-page ,(if (string-null? home-page)
(string-append %cran-url name)
home-page))
(synopsis ,synopsis)
;; Use double spacing
(description ,(regexp-substitute/global #f "\\. \\b" description
'pre ". " 'post))
(license ,license)))))
(description ,(beautify-description (assoc-ref meta "Description")))
(license ,license))))
(define (cran->guix-package package-name)
"Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the
`package' s-expression corresponding to that package, or #f on failure."
(let ((module-meta (cran-fetch package-name)))
(and=> module-meta cran-sxml->sexp)))
(and=> module-meta description->package)))
;;;
@ -209,32 +203,33 @@ representation of the package page."
(define (latest-release package)
"Return an <upstream-source> for the latest release of PACKAGE."
(define name
(if (string-prefix? "r-" package)
(string-drop package 2)
package))
(define sxml
(cran-fetch name))
(define (package->cran-name package)
(match (package-source package)
((? origin? origin)
(match (origin-uri origin)
((url rest ...)
(let ((end (string-rindex url #\_))
(start (string-rindex url #\/)))
;; The URL ends on
;; (string-append "/" name "_" version ".tar.gz")
(substring url start end)))
(_ #f)))
(_ #f)))
(and sxml
(sxml-match-let*
(((*TOP* (xhtml:html
,head
(xhtml:body
(xhtml:h2 ,name-and-synopsis)
(xhtml:p ,description)
,summary
(xhtml:h4 "Downloads:") ,downloads
. ,rest)))
sxml))
(let ((version (nodes->text (table-datum summary "Version:")))
(url (downloads->url downloads)))
(define cran-name
(package->cran-name (specification->package package)))
(define meta
(cran-fetch cran-name))
(and meta
(let ((version (assoc-ref meta "Version")))
;; CRAN does not provide signatures.
(upstream-source
(package package)
(version version)
(urls (list url)))))))
(urls (cran-uri cran-name version))))))
(define (cran-package? package)
"Return true if PACKAGE is an R package from CRAN."

View File

@ -19,120 +19,84 @@
(define-module (test-cran)
#:use-module (guix import cran)
#:use-module (guix tests)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-64)
#:use-module (srfi srfi-26)
#:use-module (ice-9 match))
(define sxml
'(*TOP* (xhtml:html
(xhtml:head
(xhtml:title "CRAN - Package my-example-sxml"))
(xhtml:body
(xhtml:h2 "my-example-sxml: Short description")
(xhtml:p "Long description")
(xhtml:table
(@ (summary "Package my-example-sxml summary"))
(xhtml:tr
(xhtml:td "Version:")
(xhtml:td "1.2.3"))
(xhtml:tr
(xhtml:td "Depends:")
(xhtml:td "R (>= 3.1.0)"))
(xhtml:tr
(xhtml:td "SystemRequirements:")
(xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)"))
(xhtml:tr
(xhtml:td "Imports:")
(xhtml:td
(xhtml:a (@ (href "../scales/index.html"))
"scales")
" (>= 0.2.3), "
(xhtml:a (@ (href "../proto/index.html"))
"proto")
", "
(xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp")
" (>= 0.11.0)"))
(xhtml:tr
(xhtml:td "Suggests:")
(xhtml:td
(xhtml:a (@ (href "../some/index.html"))
"some")
", "
(xhtml:a (@ (href "../suggestions/index.html"))
"suggestions")))
(xhtml:tr
(xhtml:td "License:")
(xhtml:td
(xhtml:a (@ (href "../../licenses/MIT")) "MIT")))
(xhtml:tr
(xhtml:td "URL:")
(xhtml:td
(xhtml:a (@ (href "http://gnu.org/s/my-example-sxml"))
"http://gnu.org/s/my-example-sxml")
", "
(xhtml:a (@ (href "http://alternative/home/page"))
"http://alternative/home/page"))))
(xhtml:h4 "Downloads:")
(xhtml:table
(@ (summary "Package my-example-sxml downloads"))
(xhtml:tr
(xhtml:td " Reference manual: ")
(xhtml:td
(xhtml:a (@ (href "my-example-sxml.pdf"))
" my-example-sxml.pdf ")))
(xhtml:tr
(xhtml:td " Package source: ")
(xhtml:td
(xhtml:a
(@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz"))
" my-example-sxml_1.2.3.tar.gz "))))
(xhtml:h4 "Reverse dependencies:")
(xhtml:table
(@ (summary "Package my-example-sxml reverse dependencies"))
(xhtml:tr
(xhtml:td "Reverse depends:")
(xhtml:td "Too many."))
(xhtml:tr
(xhtml:td "Reverse imports:")
(xhtml:td "Likewise."))
(xhtml:tr
(xhtml:td "Reverse suggests:")
(xhtml:td "Uncountable.")))))))
(define description "
Package: My-Example
Type: Package
Title: Example package
Version: 1.2.3
Date: 2015-12-10
Author: Ricardo Wurmus
Maintainer: Guix Schmeeks <guix@gnu.org>
URL: http://gnu.org/s/my-example
Description: This is a long description
spanning multiple lines: and it could confuse the parser that
there is a colon : on the lines.
And: this line continues the description.
biocViews: 0
SystemRequirements: Cairo (>= 0)
Depends: A C++11 compiler. Version 4.6.* of g++ (as
currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or
later will be fine.
License: GPL (>= 3)
Imports: Rcpp (>= 0.11.5), proto, Scales
LinkingTo: Rcpp, BH
NeedsCompilation: yes
Repository: CRAN
Date/Publication: 2015-07-14 14:15:16
")
(define simple-table
'(xhtml:table
(xhtml:tr
(xhtml:td "Numbers")
(xhtml:td "123"))
(xhtml:tr
(@ (class "whatever"))
(xhtml:td (@ (class "unimportant")) "Letters")
(xhtml:td "abc"))
(xhtml:tr
(xhtml:td "Letters")
(xhtml:td "xyz"))
(xhtml:tr
(xhtml:td "Single"))
(xhtml:tr
(xhtml:td "not a value")
(xhtml:td "not a label")
(xhtml:td "also not a label"))))
(define description-alist
((@@ (guix import cran) description->alist) description))
(define simple-alist
'(("Key" . "Value")
("SimpleList" . "R, Rcpp, something, whatever")
("BadList" . "This is not a real list, you know?")
("List" . "R (>= 2.2), BH (for no reason), GenomicRanges")))
(test-begin "cran")
(test-equal "table-datum: return list of first table cell matching label"
'((xhtml:td "abc"))
((@@ (guix import cran) table-datum) simple-table "Letters"))
(test-assert "description->alist: contains all valid keys"
(let ((keys '("Package" "Type" "Title" "Version" "Date"
"Author" "Maintainer" "URL" "Description"
"SystemRequirements" "Depends" "License"
"Imports" "biocViews" "LinkingTo"
"NeedsCompilation" "Repository"
"Date/Publication")))
(lset= string=? keys (map car description-alist))))
(test-equal "table-datum: return empty list if no match"
(test-equal "listify: return empty list if key cannot be found"
'()
((@@ (guix import cran) table-datum) simple-table "Astronauts"))
((@@ (guix import cran) listify) simple-alist "Letters"))
(test-equal "table-datum: only consider the first cell as a label cell"
(test-equal "listify: split comma-separated value into elements"
'("R" "Rcpp" "something" "whatever")
((@@ (guix import cran) listify) simple-alist "SimpleList"))
(test-equal "listify: strip off parentheses"
'("R" "BH" "GenomicRanges")
((@@ (guix import cran) listify) simple-alist "List"))
(test-equal "listify: ignore values that are no lists"
'()
((@@ (guix import cran) table-datum) simple-table "not a label"))
((@@ (guix import cran) listify) simple-alist "BadList"))
(test-equal "beautify-description: use double spacing"
"This is a package. It is great. Trust me Mr. Hendrix."
((@@ (guix import cran) beautify-description)
"This is a package. It is great. Trust me Mr. Hendrix."))
(test-assert "cran-sxml->sexp"
(test-equal "beautify-description: transform fragment into sentence"
"This package provides a function to establish world peace"
((@@ (guix import cran) beautify-description)
"A function to establish world peace"))
(test-assert "description->package"
;; Replace network resources with sample data.
(mock ((guix build download) url-fetch
(lambda* (url file-name #:key (mirrors '()))
@ -140,32 +104,37 @@
(lambda ()
(display
(match url
("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz"
("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz"
"source")
(_ (error "Unexpected URL: " url))))))))
(match ((@@ (guix import cran) cran-sxml->sexp) sxml)
(match ((@@ (guix import cran) description->package) description-alist)
(('package
('name "r-my-example-sxml")
('name "r-my-example")
('version "1.2.3")
('source ('origin
('method 'url-fetch)
('uri ('cran-uri "my-example-sxml" 'version))
('uri ('cran-uri "My-Example" 'version))
('sha256
('base32
(? string? hash)))))
('properties ('quasiquote (('upstream-name . "My-Example"))))
('build-system 'r-build-system)
('inputs
('quasiquote
(("cairo" ('unquote 'cairo)))))
('propagated-inputs
('quasiquote
(("r-proto" ('unquote 'r-proto))
(("r-bh" ('unquote 'r-bh))
("r-proto" ('unquote 'r-proto))
("r-rcpp" ('unquote 'r-rcpp))
("r-scales" ('unquote 'r-scales)))))
('home-page "http://gnu.org/s/my-example-sxml")
('synopsis "Short description")
('description "Long description")
('license 'x11)))
('home-page "http://gnu.org/s/my-example")
('synopsis "Example package")
('description
"This is a long description spanning multiple lines: \
and it could confuse the parser that there is a colon : on the \
lines. And: this line continues the description.")
('license 'gpl3+)))
(x
(begin
(format #t "~s\n" x)