From 5a1add373ab427a3b336981d857252e703a9f8d1 Mon Sep 17 00:00:00 2001
From: Mark H Weaver <mhw@netris.org>
Date: Wed, 9 Mar 2016 01:23:53 -0500
Subject: [PATCH] grafts: Make grafting faster.

* guix/build/graft.scm (replace-store-references): Reimplement for
faster grafting.  Use binary I/O instead of textual I/O.  Replace
'mapping' argument (an alist) with 'replacement-table' (a vhash).
(rewrite-directory): Adapt to mapping argument change in
'replace-store-references'.  Remove 'with-fluids' that previously set
'%default-port-encoding' to #f, since we now use binary I/O.
(define-inline, hash-length): New macros.
(nix-base32-char?): New variable.
---
 guix/build/graft.scm | 217 +++++++++++++++++++++++++++++++++----------
 1 file changed, 167 insertions(+), 50 deletions(-)

diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index fb21fc3af3..f85d485554 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -1,5 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -20,8 +21,12 @@
   #:use-module (guix build utils)
   #:use-module (rnrs bytevectors)
   #:use-module (rnrs io ports)
+  #:use-module (ice-9 vlist)
   #:use-module (ice-9 match)
   #:use-module (ice-9 threads)
+  #:use-module (ice-9 binary-ports)
+  #:use-module (srfi srfi-1)   ; list library
+  #:use-module (srfi srfi-26)  ; cut and cute
   #:export (replace-store-references
             rewrite-directory))
 
@@ -38,50 +43,134 @@
 ;;;
 ;;; Code:
 
-(define* (replace-store-references input output mapping
+(define-syntax-rule (define-inline name val)
+  (define-syntax name (identifier-syntax val)))
+
+(define-inline hash-length 32)
+
+(define nix-base32-char?
+  (cute char-set-contains?
+        ;; ASCII digits and lower case letters except e o t u
+        (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
+        <>))
+
+(define* (replace-store-references input output replacement-table
                                    #:optional (store (%store-directory)))
-  "Read data from INPUT, replacing store references according to MAPPING, and
-writing the result to OUTPUT."
-  (define pattern
-    (let ((nix-base32-chars
-           '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
-             #\a #\b #\c #\d #\f #\g #\h #\i #\j #\k #\l #\m #\n
-             #\p #\q #\r #\s #\v #\w #\x #\y #\z)))
-      `(,@(map char-set (string->list store))
-        ,(char-set #\/)
-        ,@(make-list 32 (list->char-set nix-base32-chars))
-        ,(char-set #\-))))
+  "Read data from INPUT, replacing store references according to
+REPLACEMENT-TABLE, and writing the result to OUTPUT.  REPLACEMENT-TABLE is a
+vhash that maps strings (original hashes) to bytevectors (replacement hashes).
+Note: We use string keys to work around the fact that guile-2.0 hashes all
+bytevectors to the same value."
 
-  ;; We cannot use `regexp-exec' here because it cannot deal with strings
-  ;; containing NUL characters, hence 'fold-port-matches'.
-  (with-fluids ((%default-port-encoding #f))
-    (when (file-port? input)
-      (setvbuf input _IOFBF 65536))
-    (when (file-port? output)
-      (setvbuf output _IOFBF 65536))
+  (define (lookup-replacement s)
+    (match (vhash-assoc s replacement-table)
+      ((origin . replacement)
+       replacement)
+      (#f #f)))
 
-    (let* ((len     (+ 34 (string-length store)))
-           (mapping (map (match-lambda
-                          ((origin . replacement)
-                           (unless (string=? (string-drop origin len)
-                                             (string-drop replacement len))
-                             (error "invalid replacement" origin replacement))
-                           (cons (string-take origin len)
-                                 (string-take replacement len))))
-                         mapping)))
-     (fold-port-matches (lambda (string result)
-                          (match (assoc-ref mapping string)
-                            (#f
-                             (put-bytevector output (string->utf8 string)))
-                            ((= string->utf8 replacement)
-                             (put-bytevector output replacement)))
-                          #t)
-                        #f
-                        pattern
-                        input
-                        (lambda (char result)     ;unmatched
-                          (put-u8 output (char->integer char))
-                          result)))))
+  (define (optimize-u8-predicate pred)
+    (cute vector-ref
+          (list->vector (map pred (iota 256)))
+          <>))
+
+  (define nix-base32-byte?
+    (optimize-u8-predicate
+     (compose nix-base32-char?
+              integer->char)))
+
+  (define (dash? byte) (= byte 45))
+
+  (define request-size (expt 2 20))  ; 1 MiB
+
+  ;; We scan the file for the following 33-byte pattern: 32 bytes of
+  ;; nix-base32 characters followed by a dash.  To accommodate large files,
+  ;; we do not read the entire file, but instead work on buffers of up to
+  ;; 'request-size' bytes.  To ensure that every 33-byte sequence appears
+  ;; entirely within exactly one buffer, adjacent buffers must overlap,
+  ;; i.e. they must share 32 byte positions.  We accomplish this by
+  ;; "ungetting" the last 32 bytes of each buffer before reading the next
+  ;; buffer, unless we know that we've reached the end-of-file.
+  (let ((buffer (make-bytevector request-size)))
+    (let loop ()
+      ;; Note: We avoid 'get-bytevector-n' to work around
+      ;; <http://bugs.gnu.org/17466>.
+      (match (get-bytevector-n! input buffer 0 request-size)
+        ((? eof-object?) 'done)
+        (end
+         ;; We scan the buffer for dashes that might be preceded by a
+         ;; nix-base32 hash.  The key optimization here is that whenever we
+         ;; find a NON-nix-base32 character at position 'i', we know that it
+         ;; cannot be part of a hash, so the earliest position where the next
+         ;; hash could start is i+1 with the following dash at position i+33.
+         ;;
+         ;; Since nix-base32 characters comprise only 1/8 of the 256 possible
+         ;; byte values, and exclude some of the most common letters in
+         ;; English text (e t o u), in practice we can advance by 33 positions
+         ;; most of the time.
+         (let scan-from ((i hash-length) (written 0))
+           ;; 'i' is the first position where we look for a dash.  'written'
+           ;; is the number of bytes in the buffer that have already been
+           ;; written.
+           (if (< i end)
+               (let ((byte (bytevector-u8-ref buffer i)))
+                 (cond ((and (dash? byte)
+                             ;; We've found a dash.  Note that we do not know
+                             ;; whether the preceeding 32 bytes are nix-base32
+                             ;; characters, but we do not need to know.  If
+                             ;; they are not, the following lookup will fail.
+                             (lookup-replacement
+                              (string-tabulate (lambda (j)
+                                                 (integer->char
+                                                  (bytevector-u8-ref buffer
+                                                   (+ j (- i hash-length)))))
+                                               hash-length)))
+                        => (lambda (replacement)
+                             ;; We've found a hash that needs to be replaced.
+                             ;; First, write out all bytes preceding the hash
+                             ;; that have not yet been written.
+                             (put-bytevector output buffer written
+                                             (- i hash-length written))
+                             ;; Now write the replacement hash.
+                             (put-bytevector output replacement)
+                             ;; Since the byte at position 'i' is a dash,
+                             ;; which is not a nix-base32 char, the earliest
+                             ;; position where the next hash might start is
+                             ;; i+1, and the earliest position where the
+                             ;; following dash might start is (+ i 1
+                             ;; hash-length).  Also, we have now written up to
+                             ;; position 'i' in the buffer.
+                             (scan-from (+ i 1 hash-length) i)))
+                       ;; If the byte at position 'i' is a nix-base32 char,
+                       ;; then the dash we're looking for might be as early as
+                       ;; the following byte, so we can only advance by 1.
+                       ((nix-base32-byte? byte)
+                        (scan-from (+ i 1) written))
+                       ;; If the byte at position 'i' is NOT a nix-base32
+                       ;; char, then the earliest position where the next hash
+                       ;; might start is i+1, with the following dash at
+                       ;; position (+ i 1 hash-length).
+                       (else
+                        (scan-from (+ i 1 hash-length) written))))
+
+               ;; We have finished scanning the buffer.  Now we determine how
+               ;; many bytes have not yet been written, and how many bytes to
+               ;; "unget".  If 'end' is less than 'request-size' then we read
+               ;; less than we asked for, which indicates that we are at EOF,
+               ;; so we needn't unget anything.  Otherwise, we unget up to
+               ;; 'hash-length' bytes (32 bytes).  However, we must be careful
+               ;; not to unget bytes that have already been written, because
+               ;; that would cause them to be written again from the next
+               ;; buffer.  In practice, this case occurs when a replacement is
+               ;; made near the end of the buffer.
+               (let* ((unwritten   (- end written))
+                      (unget-size  (if (= end request-size)
+                                       (min hash-length unwritten)
+                                       0))
+                      (write-size  (- unwritten unget-size)))
+                 (put-bytevector output buffer written write-size)
+                 (unget-bytevector input buffer (+ written write-size)
+                                   unget-size)
+                 (loop)))))))))
 
 (define (rename-matching-files directory mapping)
   "Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
@@ -122,6 +211,35 @@ an exception is caught."
                             #:optional (store (%store-directory)))
   "Copy DIRECTORY to OUTPUT, replacing strings according to MAPPING, a list of
 file name pairs."
+
+  (define hash-mapping
+    (let* ((prefix (string-append store "/"))
+           (start  (string-length prefix))
+           (end    (+ start hash-length)))
+      (define (valid-hash? h)
+        (every nix-base32-char? (string->list h)))
+      (define (valid-suffix? s)
+        (string-prefix? "-" s))
+      (define (hash+suffix s)
+        (and (< end (string-length s))
+             (let ((hash   (substring s start end))
+                   (suffix (substring s end)))
+               (and (string-prefix? prefix s)
+                    (valid-hash?    hash)
+                    (valid-suffix?  suffix)
+                    (list hash suffix)))))
+      (map (match-lambda
+             (((= hash+suffix (origin-hash      suffix))
+               .
+               (= hash+suffix (replacement-hash suffix)))
+              (cons origin-hash (string->utf8 replacement-hash)))
+             ((origin . replacement)
+              (error "invalid replacement" origin replacement)))
+           mapping)))
+
+  (define replacement-table
+    (alist->vhash hash-mapping))
+
   (define prefix-len
     (string-length directory))
 
@@ -138,18 +256,17 @@ file name pairs."
            (symlink (call-with-output-string
                       (lambda (output)
                         (replace-store-references (open-input-string target)
-                                                  output mapping
+                                                  output replacement-table
                                                   store)))
                     dest)))
         ((regular)
-         (with-fluids ((%default-port-encoding #f))
-           (call-with-input-file file
-             (lambda (input)
-               (call-with-output-file dest
-                 (lambda (output)
-                   (replace-store-references input output mapping
-                                             store)
-                   (chmod output (stat:perms stat))))))))
+         (call-with-input-file file
+           (lambda (input)
+             (call-with-output-file dest
+               (lambda (output)
+                 (replace-store-references input output replacement-table
+                                           store)
+                 (chmod output (stat:perms stat)))))))
         ((directory)
          (mkdir-p dest))
         (else