From 99c751fc1fc1f43e209ed10d2b2af6e32312e60d Mon Sep 17 00:00:00 2001 From: nixo Date: Sat, 20 Apr 2019 14:37:36 +0200 Subject: [PATCH] Working julia --- julia.scm | 522 ++ patches/llvm-6.0-D44650.patch | 13 + patches/llvm-6.0-DISABLE_ABI_CHECKS.patch | 39 + patches/llvm-6.0-NVPTX-addrspaces.patch | 32 + patches/llvm-6.0.0_D27296-libssp.patch | 35 + ...lvm-D27629-AArch64-large_model_6.0.1.patch | 53 + patches/llvm-D34078-vectorize-fdiv.patch | 56 + .../llvm-D42262-jumpthreading-not-i1.patch | 82 + patches/llvm-D44892-Perf-integration.patch | 677 ++ patches/llvm-D46460.patch | 26 + patches/llvm-D49832-SCEVPred.patch | 187 + patches/llvm-D50010-VNCoercion-ni.patch | 89 + patches/llvm-D50167-scev-umin.patch | 1153 ++++ patches/llvm-OProfile-line-num.patch | 48 + patches/llvm-PPC-addrspaces.patch | 29 + patches/llvm-rL323946-LSRTy.patch | 45 + patches/llvm-rL326967-aligned-load.patch | 301 + patches/llvm-rL327898.patch | 6131 +++++++++++++++++ 18 files changed, 9518 insertions(+) create mode 100644 julia.scm create mode 100644 patches/llvm-6.0-D44650.patch create mode 100644 patches/llvm-6.0-DISABLE_ABI_CHECKS.patch create mode 100644 patches/llvm-6.0-NVPTX-addrspaces.patch create mode 100644 patches/llvm-6.0.0_D27296-libssp.patch create mode 100644 patches/llvm-D27629-AArch64-large_model_6.0.1.patch create mode 100644 patches/llvm-D34078-vectorize-fdiv.patch create mode 100644 patches/llvm-D42262-jumpthreading-not-i1.patch create mode 100644 patches/llvm-D44892-Perf-integration.patch create mode 100644 patches/llvm-D46460.patch create mode 100644 patches/llvm-D49832-SCEVPred.patch create mode 100644 patches/llvm-D50010-VNCoercion-ni.patch create mode 100644 patches/llvm-D50167-scev-umin.patch create mode 100644 patches/llvm-OProfile-line-num.patch create mode 100644 patches/llvm-PPC-addrspaces.patch create mode 100644 patches/llvm-rL323946-LSRTy.patch create mode 100644 patches/llvm-rL326967-aligned-load.patch create mode 100644 patches/llvm-rL327898.patch diff --git a/julia.scm b/julia.scm new file mode 100644 index 0000000..7a91680 --- /dev/null +++ b/julia.scm @@ -0,0 +1,522 @@ +(use-modules ((guix licenses) + #:prefix license:)) +(use-modules + (guix packages)) +(use-modules + (guix download)) +(use-modules + (guix utils)) +(use-modules + (guix git-download)) +(use-modules + (guix build-system gnu)) +(use-modules + (gnu packages)) +(use-modules + (gnu packages algebra)) +(use-modules + (gnu packages base)) +(use-modules + (gnu packages compression)) +(use-modules + (gnu packages elf)) +(use-modules + (gnu packages gcc)) +(use-modules + (gnu packages llvm)) +(use-modules + (gnu packages libevent)) +(use-modules + (gnu packages libunwind)) +(use-modules + (gnu packages maths)) +(use-modules + (gnu packages multiprecision)) ; mpfr) +(use-modules + (gnu packages pcre)) +(use-modules + (gnu packages perl)) +(use-modules + (gnu packages pkg-config)) +(use-modules + (gnu packages python)) +(use-modules + (gnu packages python-xyz)) +(use-modules + (gnu packages textutils)) +(use-modules + (gnu packages tls)) +(use-modules + (gnu packages version-control)) + +(use-modules + (gnu packages wget)) + +(use-modules + (ice-9 match)) + +;; (define openblas-julia +;; (package +;; (inherit openblas) +;; (name "openblas-julia") +;; INTERFACE64= +;; )) + +;; This works *BUT* we need to apply a lot of patches. 
Compiling this +;; with julia applies the patches automatically + +(define llvm-julia + (package + (inherit llvm-6) + (name "llvm-julia") + (source + (origin + (method url-fetch) + (uri + (string-append + "http://releases.llvm.org/6.0.1/llvm-6.0.1.src.tar.xz")) + (sha256 + (base32 + "1qpls3vk85lydi5b4axl0809fv932qgsqgdgrk098567z4jc7mmn")) + (patches '("./llvm-6.0-D44650.patch" + "./llvm-6.0-DISABLE_ABI_CHECKS.patch" + "./llvm-6.0-NVPTX-addrspaces.patch" + "./llvm-6.0.0_D27296-libssp.patch" + "./llvm-D27629-AArch64-large_model_6.0.1.patch" + "./llvm-D34078-vectorize-fdiv.patch" + "./llvm-D42262-jumpthreading-not-i1.patch" + "./llvm-D44892-Perf-integration.patch" + "./llvm-D46460.patch" + "./llvm-D49832-SCEVPred.patch" + "./llvm-D50010-VNCoercion-ni.patch" + "./llvm-D50167-scev-umin.patch" + "./llvm-OProfile-line-num.patch" + "./llvm-PPC-addrspaces.patch" + "./llvm-rL323946-LSRTy.patch" + "./llvm-rL326967-aligned-load.patch" + "./llvm-rL327898.patch" + )) + )) + (arguments + (substitute-keyword-arguments + (package-arguments llvm-6) + ((#:configure-flags flags) + '(list ;; Taken from NixOS. Only way I could get libLLVM-6.0.so + "-DCMAKE_BUILD_TYPE=Release" + "-DLLVM_INSTALL_UTILS=ON" + "-DLLVM_BUILD_TESTS=ON" + "-DLLVM_ENABLE_FFI=ON" + "-DLLVM_ENABLE_RTTI=ON" + ;; "-DLLVM_HOST_TRIPLE=${stdenv.hostPlatform.config}" + ;; "-DLLVM_DEFAULT_TARGET_TRIPLE=${stdenv.hostPlatform.config}" + "-DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly" + "-DLLVM_ENABLE_DUMP=ON" + "-DLLVM_LINK_LLVM_DYLIB=ON") + ))))) + +(define libuv-julia + (let + ((commit "2348256acf5759a544e5ca7935f638d2bc091d60")) + (package + (inherit libuv) + (name "libuv-julia") + (version commit) + (source + (origin + (method url-fetch) + (uri (string-append + "https://api.github.com/repos/JuliaLang/libuv/tarball/" + commit)) + (sha256 + (base32 + "1363f4vqayfcv5zqg07qmzjff56yhad74k16c22ian45lram8mv8")))) + (build-system gnu-build-system) + (arguments + (substitute-keyword-arguments + (package-arguments libuv) + ((#:phases phases) + `(modify-phases ,phases + (delete 'autogen))))) + (home-page "https://github.com/JuliaLang/libuv")))) + +(package + (name "julia") + (version "1.1.0") + (source + (origin + (method url-fetch) + (uri + (string-append + "https://github.com/JuliaLang/julia/releases/download/v" + version "/julia-" version ".tar.gz")) + (sha256 + (base32 + "1bd6c5gqd7f2i837ay8iqi8h36smhcg0lq7f8c2axxaw8x6rcfmx")))) + (build-system gnu-build-system) + (arguments + `(#:test-target "test" + #:modules + ((ice-9 match) + (guix build gnu-build-system) + (guix build utils)) + ;; Do not strip binaries to keep support for full backtraces. + ;; See https://github.com/JuliaLang/julia/issues/17831 + #:strip-binaries? #f + + ;; The DSOs use $ORIGIN to refer to each other, but (guix build + ;; gremlin) doesn't support it yet, so skip this phase. + #:validate-runpath? 
#f + + #:phases + (modify-phases %standard-phases + (delete 'configure) + (add-after 'unpack 'prepare-deps + (lambda* + (#:key inputs #:allow-other-keys) + (mkdir "deps/srccache") + (copy-file + (assoc-ref inputs "dsfmt") + "deps/srccache/dsfmt-2.2.3.tar.gz") + (copy-file + (string-append + (assoc-ref inputs "virtualenv") + "/bin/virtualenv") + "julia-env") + (copy-file + (assoc-ref inputs "libwhich") + (string-append "deps/srccache/libwhich-" + "81e9723c0273d78493dc8c8ed570f68d9ce7e89e" + ".tar.gz")) + (copy-file (assoc-ref inputs "rmath") + "deps/srccache/Rmath-julia-0.1.tar.gz") + (copy-file + (assoc-ref inputs "objconv") + "deps/srccache/objconv.zip") + (copy-file + (assoc-ref inputs "suitesparse") + "deps/srccache/SuiteSparse-4.4.5.tar.gz") + ;; needed by libwhich + (setenv "LD_LIBRARY_PATH" + (string-join + (map (lambda (pkg) + (string-append (assoc-ref inputs pkg) "/lib")) + (list + "arpack-ng" "fftw" "gmp" "lapack" + "libgit2" "mpfr" "openblas" "openlibm" + "openspecfun" "pcre2" + )) + ":")) + + ;; (copy-file + ;; (assoc-ref inputs "llvm") + ;; "deps/srccache/llvm-6.0.0.src.tar.xz") + #t)) + ;; FIXME: Building the documentation requires Julia packages that + ;; would be downloaded from the Internet. We should build them in a + ;; separate build phase. + (add-after 'unpack 'disable-documentation + (lambda _ + (substitute* "Makefile" + (("(install: .*) \\$\\(BUILDROOT\\)/doc/_build/html/en/index.html" _ line) + (string-append line "\n")) + (("src ui doc deps") + "src ui deps")) + #t)) + (add-before 'check 'set-home + ;; Some tests require a home directory to be set. + (lambda _ + (setenv "HOME" "/tmp") + #t)) + (add-after 'unpack 'hardcode-soname-map + ;; ./src/runtime_ccall.cpp creates a map from library names to paths + ;; using the output of "/sbin/ldconfig -p". Since ldconfig is not + ;; used in Guix, we patch runtime_ccall.cpp to contain a static map. + (lambda* (#:key inputs #:allow-other-keys) + (use-modules (ice-9 match)) + (substitute* "src/runtime_ccall.cpp" + ;; Patch out invocations of '/sbin/ldconfig' to avoid getting + ;; error messages about missing '/sbin/ldconfig' on Guix System. + (("popen\\(.*ldconfig.*\\);") + "NULL;\n") + ;; Populate 'sonameMap'. 
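+             ;; For example, the "pcre2" entry below expands to a line like:
+             ;;   sonameMap["libpcre2-8"] = "/gnu/store/…-pcre2/lib/libpcre2-8.so";
+             ;; (store path shown schematically; the actual hash varies per build).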
+ (("jl_read_sonames.*;") + (string-join + (map (match-lambda + ((input libname soname) + (string-append + "sonameMap[\"" libname "\"] = " + "\"" (assoc-ref inputs input) "/lib/" soname "\";"))) + '(("libc" "libc" "libc.so.6") + ("pcre2" "libpcre2-8" "libpcre2-8.so") + ("mpfr" "libmpfr" "libmpfr.so") + ("openblas" "libblas" "libopenblas.so") + ("arpack-ng" "libarpack" "libarpack.so") + ("lapack" "liblapack" "liblapack.so") + ("libgit2" "libgit2" "libgit2.so") + ("gmp" "libgmp" "libgmp.so") + ;; ("openlibm" "libopenlibm" "libopenlibm.so") + ("openspecfun" "libopenspecfun" "libopenspecfun.so") + ("fftw" "libfftw3" "libfftw3_threads.so") + ("fftwf" "libfftw3f" "libfftw3f_threads.so")))))) + ;; FIXME: NIXO + ;; (substitute* "base/fft/FFTW.jl" + ;; (("const libfftw = Base.libfftw_name") + ;; (string-append "const libfftw = \"" + ;; (assoc-ref inputs "fftw") "/lib/libfftw3_threads.so" + ;; "\"")) + ;; (("const libfftwf = Base.libfftwf_name") + ;; (string-append "const libfftwf = \"" + ;; (assoc-ref inputs "fftwf") "/lib/libfftw3f_threads.so" + ;; "\""))) + (substitute* "base/math.jl" + (("const libm = Base.libm_name") + (string-append "const libm = \"" + (assoc-ref inputs "openlibm") + "/lib/libopenlibm.so" + "\"")) + (("const openspecfun = \"libopenspecfun\"") + (string-append "const openspecfun = \"" + (assoc-ref inputs "openspecfun") + "/lib/libopenspecfun.so" + "\""))) + ;; (substitute* "base/pcre.jl" + ;; (("const PCRE_LIB = \"libpcre2-8\"") + ;; (string-append "const PCRE_LIB = \"" + ;; (assoc-ref inputs "pcre2") + ;; "/lib/libpcre2-8.so" "\""))) + #t)) + (add-before 'build 'fix-include-and-link-paths + (lambda* + (#:key inputs #:allow-other-keys) + ;; LIBUTF8PROC is a linker flag, not a build target. It is + ;; included in the LIBFILES_* variable which is used as a + ;; collection of build targets and a list of libraries to link + ;; against. + (substitute* "src/flisp/Makefile" + (("\\$\\(BUILDDIR\\)/\\$\\(EXENAME\\): \\$\\(OBJS\\) \\$\\(LIBFILES_release\\)") + "$(BUILDDIR)/$(EXENAME): $(OBJS) $(LLT_release)") + (("\\$\\(BUILDDIR\\)/\\$\\(EXENAME\\)-debug: \\$\\(DOBJS\\) \\$\\(LIBFILES_debug\\)") + "$(BUILDDIR)/$(EXENAME)-debug: $(DOBJS) $(LLT_debug)")) + ;; The REPL must be linked with libuv. 
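+          ;; (i.e. prepend the absolute path of libuv-julia's libuv.so to
+          ;; JLDFLAGS so the REPL binary links against it rather than a
+          ;; bundled copy)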
+ (substitute* "ui/Makefile" + (("JLDFLAGS \\+= ") + (string-append "JLDFLAGS += " + (assoc-ref %build-inputs "libuv") + "/lib/libuv.so "))) + (substitute* "base/Makefile" + (("\\$\\(build_includedir\\)/uv/errno.h") + (string-append + (assoc-ref inputs "libuv") + "/include/uv/errno.h"))) + #t)) + (add-before 'build 'replace-default-shell + (lambda _ + (substitute* "base/client.jl" + (("/bin/sh") + (which "sh"))) + #t)) + (add-after 'unpack 'hardcode-paths + (lambda _ + (substitute* "stdlib/InteractiveUtils/src/InteractiveUtils.jl" + (("`which") (string-append "`" (which "which"))) + (("`wget") (string-append "`" (which "wget")))) + #t)) + (add-before 'check 'disable-broken-tests + (lambda _ + (define (touch file-name) + (call-with-output-file file-name (const #t))) + ;; Don't know why FIXME + ;; (substitute* "stdlib/LibGit2/test/libgit2.jl" + ;; (("!LibGit2.use_http_path(cfg, github_cred)") + ;; "true") + ;; (("LibGit2.use_http_path(cfg, mygit_cred)") + ;; "true")) + (map (lambda (test) + (delete-file test) + (touch test)) + '("stdlib/Sockets/test/runtests.jl" + "stdlib/Distributed/test/runtests.jl" + "stdlib/LibGit2/test/libgit2.jl" + )) + + (substitute* "test/choosetests.jl" + ;; These tests fail, probably because some of the input + ;; binaries have been stripped and thus backtraces don't look + ;; as expected. + (("\"backtrace\",") + "") + (("\"cmdlineargs\",") + "")) + #t))) + #:make-flags + (list + (string-append "prefix=" (assoc-ref %outputs "out")) + (string-append "PREFIX=" (assoc-ref %outputs "out")) + ;; Passing the MARCH flag is necessary to build binary substitutes for + ;; the supported architectures. + ,(match + (or + (%current-target-system) + (%current-system)) + ("x86_64-linux" "MARCH=x86-64") + ("i686-linux" "MARCH=pentium4") + ("aarch64-linux" "MARCH=armv8-a") + ;; Prevent errors when querying this package on unsupported + ;; platforms, e.g. when running "guix package --search=" + (_ "MARCH=UNSUPPORTED")) + "CONFIG_SHELL=bash" ;needed to build bundled libraries + "USE_SYSTEM_DSFMT=0" ;not packaged for Guix and upstream has no + ;build system for a shared library. + "USE_SYSTEM_LAPACK=1" + "USE_SYSTEM_BLAS=1" + + ;; TODO: What about building blas with 64 support? + "USE_BLAS64=0" ;needed when USE_SYSTEM_BLAS=1 + "LIBBLAS=-lopenblas" + "LIBBLASNAME=libopenblas" + + "USE_SYSTEM_FFTW=1" + "LIBFFTWNAME=libfftw3" + "LIBFFTWFNAME=libfftw3f" + + ;; TODO: Suitesparse does not install shared libraries, so we cannot + ;; use the suitesparse package. + ;; "USE_SYSTEM_SUITESPARSE=1" + ;; (string-append "SUITESPARSE_INC=-I " + ;; (assoc-ref %build-inputs "suitesparse") + ;; "/include") + + "USE_GPL_LIBS=1" ;proudly + "USE_SYSTEM_UTF8PROC=1" + (string-append "UTF8PROC_INC=" + (assoc-ref %build-inputs "utf8proc") + "/include") + "USE_SYSTEM_LLVM=1" + "LLVM_VER=6.0.1" + + ;; "LLVM_VER=6.0.0" + "USE_LLVM_SHLIB=1" ; FIXME: fails when set to 1 + + "USE_SYSTEM_LIBUNWIND=1" + "USE_SYSTEM_LIBUV=1" + (string-append "LIBUV=" + (assoc-ref %build-inputs "libuv") + "/lib/libuv.so") + (string-append "LIBUV_INC=" + (assoc-ref %build-inputs "libuv") + "/include") + "USE_SYSTEM_PATCHELF=1" + "USE_SYSTEM_PCRE=1" + "USE_SYSTEM_OPENLIBM=1" + + "USE_SYSTEM_GMP=1" + "USE_SYSTEM_MPFR=1" + "USE_SYSTEM_ARPACK=1" + "USE_SYSTEM_LIBGIT2=1" + "USE_SYSTEM_ZLIB=1" + "USE_SYSTEM_OPENSPECFUN=1"))) + (inputs + `( ("llvm" ,llvm-julia) + ;; The bundled version is 3.3.0 so stick to that version. With other + ;; versions, we get test failures in 'linalg/arnoldi' as described in + ;; . 
+ ("arpack-ng" ,arpack-ng-3.3.0) + ("coreutils" ,coreutils) + ;for bindings to "mkdir" and the like + ("lapack" ,lapack) + ("openblas" ,openblas) + ;Julia does not build with Atlas + ("libunwind" ,libunwind) + ("openlibm" ,openlibm) + ("openspecfun" ,openspecfun) + ("libuv", libuv-julia) + ("libgit2" ,libgit2) + ("fftw" ,fftw) + ("fftwf" ,fftwf) + ("fortran" ,gfortran) + ("pcre2" ,pcre2) + ("utf8proc" ,utf8proc) + ("mpfr" ,mpfr) + ("wget" ,wget) + ("which" ,which) + ("zlib" ,zlib) + ("gmp" ,gmp) + ("virtualenv" ,python2-virtualenv) + ;; FIXME: The following inputs are downloaded from upstream to allow us + ;; to use the lightweight Julia release tarball. Ideally, these inputs + ;; would eventually be replaced with proper Guix packages. + + ;; TODO: run "make -f contrib/repackage_system_suitesparse4.make" to copy static lib + ("rmath" + ,(origin + (method url-fetch) + (uri "https://api.github.com/repos/JuliaLang/Rmath-julia/tarball/v0.1") + (sha256 + (base32 + "1qyps217175qhid46l8f5i1v8i82slgp23ia63x2hzxwfmx8617p")))) + ("suitesparse" + ,(origin + (method url-fetch) + (uri "http://faculty.cse.tamu.edu/davis/SuiteSparse/SuiteSparse-4.4.5.tar.gz") + (sha256 + (base32 + "1jcbxb8jx5wlcixzf6n5dca2rcfx6mlcms1k2rl5gp67ay3bix43")))) + ("objconv" + ,(origin + (method url-fetch) + ;; No versioned URL, see for updates. + (uri "https://www.agner.org/optimize/objconv.zip") + (file-name "objconv-2018-10-07.zip") + (sha256 + (base32 + "0wp6ld9vk11f4nnkn56627zmlv9k5vafi99qa3yyn1pgcd61zcfs")))) + ("libwhich" + ,(origin + (method url-fetch) + (uri + (string-append + "https://api.github.com/repos/vtjnash/libwhich/tarball/" + "81e9723c0273d78493dc8c8ed570f68d9ce7e89e")) + (sha256 + (base32 + "1p7zg31kpmpbmh1znrk1xrbd074agx13b9q4dcw8n2zrwwdlbz3b")))) + ;; ("llvm" + ;; ,(origin + ;; (method url-fetch) + ;; (uri + ;; (string-append + ;; "http://releases.llvm.org/6.0.0/llvm-6.0.0.src.tar.xz")) + ;; (sha256 + ;; (base32 + ;; "0224xvfg6h40y5lrbnb9qaq3grmdc5rg00xq03s1wxjfbf8krx8z")))) + ;; ("cmake" ,cmake) ;; required to build llvm + ("dsfmt" + ,(origin + (method url-fetch) + (uri + (string-append + "http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/" + "SFMT/dSFMT-src-2.2.3.tar.gz")) + (sha256 + (base32 + "03kaqbjbi6viz0n33dk5jlf6ayxqlsq4804n7kwkndiga9s4hd42")))))) + (native-inputs + `(("openssl" ,openssl) + ("perl" ,perl) + ("patchelf" ,patchelf) + ("pkg-config" ,pkg-config) + ("python" ,python-2))) + ;; Julia is not officially released for ARM and MIPS. + ;; See https://github.com/JuliaLang/julia/issues/10639 + (supported-systems + '("i686-linux" "x86_64-linux" "aarch64-linux")) + (home-page "https://julialang.org/") + (synopsis "High-performance dynamic language for technical computing") + (description + "Julia is a high-level, high-performance dynamic programming language for +technical computing, with syntax that is familiar to users of other technical +computing environments. 
It provides a sophisticated compiler, distributed +parallel execution, numerical accuracy, and an extensive mathematical function +library.") + (license license:expat)) + diff --git a/patches/llvm-6.0-D44650.patch b/patches/llvm-6.0-D44650.patch new file mode 100644 index 0000000..353c823 --- /dev/null +++ b/patches/llvm-6.0-D44650.patch @@ -0,0 +1,13 @@ +Index: tools/llvm-cfi-verify/CMakeLists.txt +=================================================================== +--- a/tools/llvm-cfi-verify/CMakeLists.txt ++++ b/tools/llvm-cfi-verify/CMakeLists.txt +@@ -11,7 +11,7 @@ + Symbolize + ) + +-add_llvm_tool(llvm-cfi-verify ++add_llvm_tool(llvm-cfi-verify DISABLE_LLVM_LINK_LLVM_DYLIB + llvm-cfi-verify.cpp) + + add_subdirectory(lib) diff --git a/patches/llvm-6.0-DISABLE_ABI_CHECKS.patch b/patches/llvm-6.0-DISABLE_ABI_CHECKS.patch new file mode 100644 index 0000000..d537c25 --- /dev/null +++ b/patches/llvm-6.0-DISABLE_ABI_CHECKS.patch @@ -0,0 +1,39 @@ +From d793ba4bacae51ae25be19c1636fcf38707938fd Mon Sep 17 00:00:00 2001 +From: Valentin Churavy +Date: Fri, 1 Jun 2018 17:43:55 -0400 +Subject: [PATCH] fix LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING + +--- + cmake/modules/HandleLLVMOptions.cmake | 2 +- + include/llvm/Config/abi-breaking.h.cmake | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake +index 3d2dd48018c..b67ee6a896e 100644 +--- a/cmake/modules/HandleLLVMOptions.cmake ++++ b/cmake/modules/HandleLLVMOptions.cmake +@@ -572,7 +572,7 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) + + if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE) + append("-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +- append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) ++ append("-Wno-long-long -Wundef" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() + + add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG) +diff --git a/include/llvm/Config/abi-breaking.h.cmake b/include/llvm/Config/abi-breaking.h.cmake +index 7ae401e5b8a..d52c4609101 100644 +--- a/include/llvm/Config/abi-breaking.h.cmake ++++ b/include/llvm/Config/abi-breaking.h.cmake +@@ -20,7 +20,7 @@ + + /* Allow selectively disabling link-time mismatch checking so that header-only + ADT content from LLVM can be used without linking libSupport. 
*/ +-#if !LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING ++#ifndef LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING + + // ABI_BREAKING_CHECKS protection: provides link-time failure when clients build + // mismatch with LLVM +-- +2.17.0 + diff --git a/patches/llvm-6.0-NVPTX-addrspaces.patch b/patches/llvm-6.0-NVPTX-addrspaces.patch new file mode 100644 index 0000000..d8c519e --- /dev/null +++ b/patches/llvm-6.0-NVPTX-addrspaces.patch @@ -0,0 +1,32 @@ +diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp +index f1e4251a44b..73d49f5d7e4 100644 +--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp ++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp +@@ -1248,6 +1248,14 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, + } + } + ++bool NVPTXTargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, ++ unsigned DestAS) const { ++ assert(SrcAS != DestAS && "Expected different address spaces!"); ++ ++ return (SrcAS == ADDRESS_SPACE_GENERIC || SrcAS > ADDRESS_SPACE_LOCAL) && ++ (DestAS == ADDRESS_SPACE_GENERIC || DestAS > ADDRESS_SPACE_LOCAL); ++} ++ + SDValue + NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); +diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h +index ef04a8573d4..68a9a7195c4 100644 +--- a/lib/Target/NVPTX/NVPTXISelLowering.h ++++ b/lib/Target/NVPTX/NVPTXISelLowering.h +@@ -443,6 +443,8 @@ public: + const NVPTXSubtarget &STI); + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + ++ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; ++ + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + + const char *getTargetNodeName(unsigned Opcode) const override; diff --git a/patches/llvm-6.0.0_D27296-libssp.patch b/patches/llvm-6.0.0_D27296-libssp.patch new file mode 100644 index 0000000..dc703ad --- /dev/null +++ b/patches/llvm-6.0.0_D27296-libssp.patch @@ -0,0 +1,35 @@ +Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- a/lib/Target/X86/X86ISelLowering.cpp ++++ b/lib/Target/X86/X86ISelLowering.cpp +@@ -2098,7 +2098,8 @@ + + void X86TargetLowering::insertSSPDeclarations(Module &M) const { + // MSVC CRT provides functionalities for stack protection. +- if (Subtarget.getTargetTriple().isOSMSVCRT()) { ++ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || ++ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); +@@ -2120,15 +2121,19 @@ + + Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. +- if (Subtarget.getTargetTriple().isOSMSVCRT()) ++ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || ++ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + return M.getGlobalVariable("__security_cookie"); ++ } + return TargetLowering::getSDagStackGuard(M); + } + + Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. 
+- if (Subtarget.getTargetTriple().isOSMSVCRT()) ++ if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || ++ Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { + return M.getFunction("__security_check_cookie"); ++ } + return TargetLowering::getSSPStackGuardCheck(M); + } diff --git a/patches/llvm-D27629-AArch64-large_model_6.0.1.patch b/patches/llvm-D27629-AArch64-large_model_6.0.1.patch new file mode 100644 index 0000000..89beefd --- /dev/null +++ b/patches/llvm-D27629-AArch64-large_model_6.0.1.patch @@ -0,0 +1,53 @@ +From f76abe65e6d07fea5e838c4f8c9a9421c16debb0 Mon Sep 17 00:00:00 2001 +From: Valentin Churavy +Date: Thu, 5 Jul 2018 12:37:50 -0400 +Subject: [PATCH] Fix unwind info relocation with large code model on AArch64 + +--- + lib/MC/MCObjectFileInfo.cpp | 2 ++ + .../AArch64/ELF_ARM64_large-relocations.s | 20 +++++++++++++++++++ + 2 files changed, 22 insertions(+) + create mode 100644 test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_large-relocations.s + +diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp +index 328f000f37c..938b35f20d1 100644 +--- a/lib/MC/MCObjectFileInfo.cpp ++++ b/lib/MC/MCObjectFileInfo.cpp +@@ -291,6 +291,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { + break; + case Triple::ppc64: + case Triple::ppc64le: ++ case Triple::aarch64: ++ case Triple::aarch64_be: + case Triple::x86_64: + FDECFIEncoding = dwarf::DW_EH_PE_pcrel | + (Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4); +diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_large-relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_large-relocations.s +new file mode 100644 +index 00000000000..66f28dabd79 +--- /dev/null ++++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_large-relocations.s +@@ -0,0 +1,20 @@ ++# RUN: llvm-mc -triple=arm64-none-linux-gnu -large-code-model -filetype=obj -o %T/large-reloc.o %s ++# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -map-section large-reloc.o,.eh_frame=0x10000 -map-section large-reloc.o,.text=0xffff000000000000 -check=%s %T/large-reloc.o ++# RUN-BE: llvm-mc -triple=aarch64_be-none-linux-gnu -large-code-model -filetype=obj -o %T/be-large-reloc.o %s ++# RUN-BE: llvm-rtdyld -triple=aarch64_be-none-linux-gnu -verify -map-section be-large-reloc.o,.eh_frame=0x10000 -map-section be-large-reloc.o,.text=0xffff000000000000 -check=%s %T/be-large-reloc.o ++ ++ .text ++ .globl g ++ .p2align 2 ++ .type g,@function ++g: ++ .cfi_startproc ++ mov x0, xzr ++ ret ++ .Lfunc_end0: ++ .size g, .Lfunc_end0-g ++ .cfi_endproc ++ ++# Skip the CIE and load the 8 bytes PC begin pointer. ++# Assuming the CIE and the FDE length are both 4 bytes. ++# rtdyld-check: *{8}(section_addr(large-reloc.o, .eh_frame) + (*{4}(section_addr(large-reloc.o, .eh_frame))) + 0xc) = g - (section_addr(large-reloc.o, .eh_frame) + (*{4}(section_addr(large-reloc.o, .eh_frame))) + 0xc) +-- +2.18.0 + diff --git a/patches/llvm-D34078-vectorize-fdiv.patch b/patches/llvm-D34078-vectorize-fdiv.patch new file mode 100644 index 0000000..a6df7d1 --- /dev/null +++ b/patches/llvm-D34078-vectorize-fdiv.patch @@ -0,0 +1,56 @@ +From f94d12b6108b944199b715f31f25a022f75d2feb Mon Sep 17 00:00:00 2001 +From: Yichao Yu +Date: Sat, 10 Jun 2017 08:45:13 -0400 +Subject: [PATCH 4/4] Enable support for floating-point division reductions + +Similar to fsub, fdiv can also be vectorized using fmul. 
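+For example, a reduction of the form s = s / a[i] keeps an fdiv chain in each
+vector lane and combines the lanes with a single multiply at the end; this
+reassociation is only valid under fast-math, hence the "fdiv fast <4 x float>"
+expected by the new test below.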
+--- + lib/Transforms/Utils/LoopUtils.cpp | 1 + + test/Transforms/LoopVectorize/float-reduction.ll | 22 ++++++++++++++++++++++ + 2 files changed, 23 insertions(+) + +diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp +index 3c522786641..a4aced53a95 100644 +--- a/lib/Transforms/Utils/LoopUtils.cpp ++++ b/lib/Transforms/Utils/LoopUtils.cpp +@@ -451,6 +451,7 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, + return InstDesc(Kind == RK_IntegerOr, I); + case Instruction::Xor: + return InstDesc(Kind == RK_IntegerXor, I); ++ case Instruction::FDiv: + case Instruction::FMul: + return InstDesc(Kind == RK_FloatMult, I, UAI); + case Instruction::FSub: +diff --git a/test/Transforms/LoopVectorize/float-reduction.ll b/test/Transforms/LoopVectorize/float-reduction.ll +index f3b95d0ead7..669c54d55a2 100644 +--- a/test/Transforms/LoopVectorize/float-reduction.ll ++++ b/test/Transforms/LoopVectorize/float-reduction.ll +@@ -44,3 +44,25 @@ for.body: ; preds = %for.body, %entry + for.end: ; preds = %for.body + ret float %sub + } ++ ++;CHECK-LABEL: @foodiv( ++;CHECK: fdiv fast <4 x float> ++;CHECK: ret ++define float @foodiv(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %sum.04 = phi float [ 1.000000e+00, %entry ], [ %sub, %for.body ] ++ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv ++ %0 = load float, float* %arrayidx, align 4 ++ %sub = fdiv fast float %sum.04, %0 ++ %indvars.iv.next = add i64 %indvars.iv, 1 ++ %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ++ %exitcond = icmp eq i32 %lftr.wideiv, 200 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret float %sub ++} +-- +2.14.1 + diff --git a/patches/llvm-D42262-jumpthreading-not-i1.patch b/patches/llvm-D42262-jumpthreading-not-i1.patch new file mode 100644 index 0000000..4aec2cb --- /dev/null +++ b/patches/llvm-D42262-jumpthreading-not-i1.patch @@ -0,0 +1,82 @@ +commit 6a311a7a804831fea43cfb2f61322adcb407a1af +Author: Keno Fischer +Date: Thu Jan 18 15:57:05 2018 -0500 + + [JumpThreading] Don't restrict cast-traversal to i1 + + Summary: + In D17663, JumpThreading learned to look trough simple cast instructions, + but only if the source of those cast instructions was a phi/cmp i1 + (in an effort to limit compile time effects). I think this condition + is too restrictive. For switches with limited value range, InstCombine + will readily introduce an extra `trunc` instruction to a smaller + integer type (e.g. from i8 to i2), leaving us in the somewhat perverse + situation that jump-threading would work before running instcombine, + but not after. Since instcombine produces this pattern, I think we + need to consider it canonical and support it in JumpThreading. + In general, for limiting recursion, I think the existing restriction + to phi and cmp nodes should be sufficient to avoid looking through + unprofitable chains of instructions. 
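+    The @trunc_switch test added below exercises exactly this pattern
+    (a phi of i8 constants, truncated to i2 and fed into a switch).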
+ + Reviewers: haicheng, gberry, bmakam, mcrosier + + Subscribers: llvm-commits + + Differential Revision: https://reviews.llvm.org/D42262 + +diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp +index 95c4650..1155e18 100644 +--- a/lib/Transforms/Scalar/JumpThreading.cpp ++++ b/lib/Transforms/Scalar/JumpThreading.cpp +@@ -647,11 +647,9 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( + } + + // Handle Cast instructions. Only see through Cast when the source operand is +- // PHI or Cmp and the source type is i1 to save the compilation time. ++ // PHI or Cmp to save the compilation time. + if (CastInst *CI = dyn_cast(I)) { + Value *Source = CI->getOperand(0); +- if (!Source->getType()->isIntegerTy(1)) +- return false; + if (!isa(Source) && !isa(Source)) + return false; + ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI); +diff --git a/test/Transforms/JumpThreading/basic.ll b/test/Transforms/JumpThreading/basic.ll +index ce86cba..16e7549 100644 +--- a/test/Transforms/JumpThreading/basic.ll ++++ b/test/Transforms/JumpThreading/basic.ll +@@ -547,6 +547,34 @@ l5: + ; CHECK: } + } + ++define i1 @trunc_switch(i1 %arg) { ++; CHECK-LABEL: @trunc_switch ++top: ++; CHECK: br i1 %arg, label %exitA, label %exitB ++ br i1 %arg, label %common, label %B ++ ++B: ++ br label %common ++ ++common: ++ %phi = phi i8 [ 2, %B ], [ 1, %top ] ++ %trunc = trunc i8 %phi to i2 ++; CHECK-NOT: switch ++ switch i2 %trunc, label %unreach [ ++ i2 1, label %exitA ++ i2 -2, label %exitB ++ ] ++ ++unreach: ++ unreachable ++ ++exitA: ++ ret i1 true ++ ++exitB: ++ ret i1 false ++} ++ + ; CHECK-LABEL: define void @h_con(i32 %p) { + define void @h_con(i32 %p) { + %x = icmp ult i32 %p, 5 diff --git a/patches/llvm-D44892-Perf-integration.patch b/patches/llvm-D44892-Perf-integration.patch new file mode 100644 index 0000000..e849bcd --- /dev/null +++ b/patches/llvm-D44892-Perf-integration.patch @@ -0,0 +1,677 @@ +From 45bc0f0badbdbabaed7d204757c2aad7ab49a3fe Mon Sep 17 00:00:00 2001 +From: DokFaust +Date: Mon, 11 Jun 2018 12:59:42 +0200 +Subject: [PATCH] PerfJITEventListener integration, requires compile flag + LLVM_USE_PERF + +--- + CMakeLists.txt | 13 + + include/llvm/Config/config.h.cmake | 3 + + include/llvm/Config/llvm-config.h.cmake | 3 + + .../llvm/ExecutionEngine/JITEventListener.h | 9 + + lib/ExecutionEngine/CMakeLists.txt | 4 + + lib/ExecutionEngine/LLVMBuild.txt | 2 +- + lib/ExecutionEngine/Orc/LLVMBuild.txt | 2 +- + .../PerfJITEvents/CMakeLists.txt | 5 + + .../PerfJITEvents/LLVMBuild.txt | 23 + + .../PerfJITEvents/PerfJITEventListener.cpp | 492 ++++++++++++++++++ + 10 files changed, 554 insertions(+), 2 deletions(-) + create mode 100644 lib/ExecutionEngine/PerfJITEvents/CMakeLists.txt + create mode 100644 lib/ExecutionEngine/PerfJITEvents/LLVMBuild.txt + create mode 100644 lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f8da6cf9211..fb92c825a46 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -426,6 +426,16 @@ if( LLVM_USE_OPROFILE ) + endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) + endif( LLVM_USE_OPROFILE ) + ++option(LLVM_USE_PERF ++ "Use perf JIT interface to inform perf about JIT code" OFF) ++ ++# If enabled, verify we are on a platform that supports perf. 
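++# (enable at configure time with -DLLVM_USE_PERF=ON)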
++if( LLVM_USE_PERF ) ++ if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) ++ message(FATAL_ERROR "perf support is available on Linux only.") ++ endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" ) ++endif( LLVM_USE_PERF ) ++ + set(LLVM_USE_SANITIZER "" CACHE STRING + "Define the sanitizer used to build binaries and tests.") + set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH +@@ -634,6 +644,9 @@ endif (LLVM_USE_INTEL_JITEVENTS) + if (LLVM_USE_OPROFILE) + set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} OProfileJIT) + endif (LLVM_USE_OPROFILE) ++if (LLVM_USE_PERF) ++ set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} PerfJITEvents) ++endif (LLVM_USE_PERF) + + message(STATUS "Constructing LLVMBuild project information") + execute_process( +diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake +index 940f8420304..17787ed779b 100644 +--- a/include/llvm/Config/config.h.cmake ++++ b/include/llvm/Config/config.h.cmake +@@ -377,6 +377,9 @@ + /* Define if we have the oprofile JIT-support library */ + #cmakedefine01 LLVM_USE_OPROFILE + ++/* Define if we have the perf JIT-support library */ ++#cmakedefine01 LLVM_USE_PERF ++ + /* LLVM version information */ + #cmakedefine LLVM_VERSION_INFO "${LLVM_VERSION_INFO}" + +diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake +index 4daa00f3bc4..8d9c3b24d52 100644 +--- a/include/llvm/Config/llvm-config.h.cmake ++++ b/include/llvm/Config/llvm-config.h.cmake +@@ -65,6 +65,9 @@ + /* Define if we have the oprofile JIT-support library */ + #cmakedefine01 LLVM_USE_OPROFILE + ++/* Define if we have the perf JIT-support library */ ++#cmakedefine01 LLVM_USE_PERF ++ + /* Major version of the LLVM API */ + #define LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR} + +diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h +index ff7840f00a4..1cc2c423a8b 100644 +--- a/include/llvm/ExecutionEngine/JITEventListener.h ++++ b/include/llvm/ExecutionEngine/JITEventListener.h +@@ -115,6 +115,15 @@ public: + } + #endif // USE_OPROFILE + ++#ifdef LLVM_USE_PERF ++ static JITEventListener *createPerfJITEventListener(); ++#else ++ static JITEventListener *createPerfJITEventListener() ++ { ++ return nullptr; ++ } ++#endif //USE_PERF ++ + private: + virtual void anchor(); + }; +diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt +index 84b34919e44..893d113a685 100644 +--- a/lib/ExecutionEngine/CMakeLists.txt ++++ b/lib/ExecutionEngine/CMakeLists.txt +@@ -30,3 +30,7 @@ endif( LLVM_USE_OPROFILE ) + if( LLVM_USE_INTEL_JITEVENTS ) + add_subdirectory(IntelJITEvents) + endif( LLVM_USE_INTEL_JITEVENTS ) ++ ++if( LLVM_USE_PERF ) ++ add_subdirectory(PerfJITEvents) ++endif( LLVM_USE_PERF ) +diff --git a/lib/ExecutionEngine/LLVMBuild.txt b/lib/ExecutionEngine/LLVMBuild.txt +index 9d29a41f504..b6e1bda6a51 100644 +--- a/lib/ExecutionEngine/LLVMBuild.txt ++++ b/lib/ExecutionEngine/LLVMBuild.txt +@@ -16,7 +16,7 @@ + ;===------------------------------------------------------------------------===; + + [common] +-subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT Orc ++subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT Orc PerfJITEvents + + [component_0] + type = Library +diff --git a/lib/ExecutionEngine/Orc/LLVMBuild.txt b/lib/ExecutionEngine/Orc/LLVMBuild.txt +index 8f05172e77a..ef4ae64e823 100644 +--- a/lib/ExecutionEngine/Orc/LLVMBuild.txt ++++ b/lib/ExecutionEngine/Orc/LLVMBuild.txt +@@ -19,4 +19,4 @@ + type = Library 
+ name = OrcJIT + parent = ExecutionEngine +-required_libraries = Core ExecutionEngine Object RuntimeDyld Support TransformUtils ++required_libraries = Core ExecutionEngine Object RuntimeDyld Support TransformUtils +diff --git a/lib/ExecutionEngine/PerfJITEvents/CMakeLists.txt b/lib/ExecutionEngine/PerfJITEvents/CMakeLists.txt +new file mode 100644 +index 00000000000..136cc429d02 +--- /dev/null ++++ b/lib/ExecutionEngine/PerfJITEvents/CMakeLists.txt +@@ -0,0 +1,5 @@ ++add_llvm_library(LLVMPerfJITEvents ++ PerfJITEventListener.cpp ++ ) ++ ++add_dependencies(LLVMPerfJITEvents LLVMCodeGen) +diff --git a/lib/ExecutionEngine/PerfJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/PerfJITEvents/LLVMBuild.txt +new file mode 100644 +index 00000000000..b1958a69260 +--- /dev/null ++++ b/lib/ExecutionEngine/PerfJITEvents/LLVMBuild.txt +@@ -0,0 +1,23 @@ ++;===- ./lib/ExecutionEngine/PerfJITEvents/LLVMBuild.txt ----------------*- Conf -*--===; ++; ++; The LLVM Compiler Infrastructure ++; ++; This file is distributed under the University of Illinois Open Source ++; License. See LICENSE.TXT for details. ++; ++;===------------------------------------------------------------------------===; ++; ++; This is an LLVMBuild description file for the components in this subdirectory. ++; ++; For more information on the LLVMBuild system, please see: ++; ++; http://llvm.org/docs/LLVMBuild.html ++; ++;===------------------------------------------------------------------------===; ++ ++[component_0] ++type = OptionalLibrary ++name = PerfJITEvents ++parent = ExecutionEngine ++required_libraries = CodeGen Core DebugInfoDWARF ExecutionEngine Object Support TransformUtils ++ +diff --git a/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +new file mode 100644 +index 00000000000..c2b97dd59f3 +--- /dev/null ++++ b/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp +@@ -0,0 +1,492 @@ ++//===-- PerfJITEventListener.cpp - Tell Linux's perf about JITted code ----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines a JITEventListener object that tells perf about JITted ++// functions, including source line information. 
++// ++// Documentation for perf jit integration is available at: ++// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jitdump-specification.txt ++// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jit-interface.txt ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/ADT/Twine.h" ++#include "llvm/Config/config.h" ++#include "llvm/DebugInfo/DWARF/DWARFContext.h" ++#include "llvm/ExecutionEngine/JITEventListener.h" ++#include "llvm/Object/ObjectFile.h" ++#include "llvm/Object/SymbolSize.h" ++#include "llvm/Support/Debug.h" ++#include "llvm/Support/Errno.h" ++#include "llvm/Support/FileSystem.h" ++#include "llvm/Support/MemoryBuffer.h" ++#include "llvm/Support/Mutex.h" ++#include "llvm/Support/MutexGuard.h" ++#include "llvm/Support/Path.h" ++#include "llvm/Support/Process.h" ++#include "llvm/Support/Threading.h" ++#include "llvm/Support/raw_ostream.h" ++ ++#include // mmap() ++#include // getpid() ++#include // clock_gettime(), time(), localtime_r() */ ++#include // for getpid(), read(), close() ++ ++using namespace llvm; ++using namespace llvm::object; ++typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; ++ ++namespace { ++ ++// language identifier (XXX: should we generate something better from debug ++// info?) ++#define JIT_LANG "llvm-IR" ++#define LLVM_PERF_JIT_MAGIC \ ++ ((uint32_t)'J' << 24 | (uint32_t)'i' << 16 | (uint32_t)'T' << 8 | \ ++ (uint32_t)'D') ++#define LLVM_PERF_JIT_VERSION 1 ++ ++// bit 0: set if the jitdump file is using an architecture-specific timestamp ++// clock source ++#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << 0) ++ ++struct LLVMPerfJitHeader; ++ ++class PerfJITEventListener : public JITEventListener { ++public: ++ PerfJITEventListener(); ++ ~PerfJITEventListener() { ++ if (MarkerAddr) ++ CloseMarker(); ++ } ++ ++ void NotifyObjectEmitted(const ObjectFile &Obj, ++ const RuntimeDyld::LoadedObjectInfo &L) override; ++ void NotifyFreeingObject(const ObjectFile &Obj) override; ++ ++private: ++ bool InitDebuggingDir(); ++ bool OpenMarker(); ++ void CloseMarker(); ++ static bool FillMachine(LLVMPerfJitHeader &hdr); ++ ++ void NotifyCode(Expected &Symbol, uint64_t CodeAddr, ++ uint64_t CodeSize); ++ void NotifyDebug(uint64_t CodeAddr, DILineInfoTable Lines); ++ ++ // cache lookups ++ pid_t Pid; ++ ++ // base directory for output data ++ std::string JitPath; ++ ++ // output data stream, closed via Dumpstream ++ int DumpFd = -1; ++ ++ // output data stream ++ std::unique_ptr Dumpstream; ++ ++ // prevent concurrent dumps from messing up the output file ++ sys::Mutex Mutex; ++ ++ // perf mmap marker ++ void *MarkerAddr = NULL; ++ ++ // perf support ready ++ bool SuccessfullyInitialized = false; ++ ++ // identifier for functions, primarily to identify when moving them around ++ uint64_t CodeGeneration = 1; ++}; ++ ++// The following are POD struct definitions from the perf jit specification ++ ++enum LLVMPerfJitRecordType { ++ JIT_CODE_LOAD = 0, ++ JIT_CODE_MOVE = 1, // not emitted, code isn't moved ++ JIT_CODE_DEBUG_INFO = 2, ++ JIT_CODE_CLOSE = 3, // not emitted, unnecessary ++ JIT_CODE_UNWINDING_INFO = 4, // not emitted ++ ++ JIT_CODE_MAX ++}; ++ ++struct LLVMPerfJitHeader { ++ uint32_t Magic; // characters "JiTD" ++ uint32_t Version; // header version ++ uint32_t TotalSize; // total size of header ++ uint32_t ElfMach; // elf mach target ++ uint32_t Pad1; // reserved ++ uint32_t Pid; ++ uint64_t Timestamp; // 
timestamp ++ uint64_t Flags; // flags ++}; ++ ++// record prefix (mandatory in each record) ++struct LLVMPerfJitRecordPrefix { ++ uint32_t Id; // record type identifier ++ uint32_t TotalSize; ++ uint64_t Timestamp; ++}; ++ ++struct LLVMPerfJitRecordCodeLoad { ++ LLVMPerfJitRecordPrefix Prefix; ++ ++ uint32_t Pid; ++ uint32_t Tid; ++ uint64_t Vma; ++ uint64_t CodeAddr; ++ uint64_t CodeSize; ++ uint64_t CodeIndex; ++}; ++ ++struct LLVMPerfJitDebugEntry { ++ uint64_t Addr; ++ int Lineno; // source line number starting at 1 ++ int Discrim; // column discriminator, 0 is default ++ // followed by null terminated filename, \xff\0 if same as previous entry ++}; ++ ++struct LLVMPerfJitRecordDebugInfo { ++ LLVMPerfJitRecordPrefix Prefix; ++ ++ uint64_t CodeAddr; ++ uint64_t NrEntry; ++ // followed by NrEntry LLVMPerfJitDebugEntry records ++}; ++ ++static inline uint64_t timespec_to_ns(const struct timespec *ts) { ++ const uint64_t NanoSecPerSec = 1000000000; ++ return ((uint64_t)ts->tv_sec * NanoSecPerSec) + ts->tv_nsec; ++} ++ ++static inline uint64_t perf_get_timestamp(void) { ++ struct timespec ts; ++ int ret; ++ ++ ret = clock_gettime(CLOCK_MONOTONIC, &ts); ++ if (ret) ++ return 0; ++ ++ return timespec_to_ns(&ts); ++} ++ ++PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) { ++ // check if clock-source is supported ++ if (!perf_get_timestamp()) { ++ errs() << "kernel does not support CLOCK_MONOTONIC\n"; ++ return; ++ } ++ ++ if (!InitDebuggingDir()) { ++ errs() << "could not initialize debugging directory\n"; ++ return; ++ } ++ ++ std::string Filename; ++ raw_string_ostream FilenameBuf(Filename); ++ FilenameBuf << JitPath << "/jit-" << Pid << ".dump"; ++ ++ // Need to open ourselves, because we need to hand the FD to OpenMarker() and ++ // raw_fd_ostream doesn't expose the FD. ++ using sys::fs::openFileForWrite; ++ if (auto EC = ++ openFileForWrite(FilenameBuf.str(), DumpFd, sys::fs::F_RW, 0666)) { ++ errs() << "could not open JIT dump file " << FilenameBuf.str() << ": " ++ << EC.message() << "\n"; ++ return; ++ } ++ ++ Dumpstream = make_unique(DumpFd, true); ++ ++ LLVMPerfJitHeader Header = {0}; ++ if (!FillMachine(Header)) ++ return; ++ ++ // signal this process emits JIT information ++ if (!OpenMarker()) ++ return; ++ ++ // emit dumpstream header ++ Header.Magic = LLVM_PERF_JIT_MAGIC; ++ Header.Version = LLVM_PERF_JIT_VERSION; ++ Header.TotalSize = sizeof(Header); ++ Header.Pid = Pid; ++ Header.Timestamp = perf_get_timestamp(); ++ Dumpstream->write(reinterpret_cast(&Header), sizeof(Header)); ++ ++ // Everything initialized, can do profiling now. ++ if (!Dumpstream->has_error()) ++ SuccessfullyInitialized = true; ++} ++ ++void PerfJITEventListener::NotifyObjectEmitted( ++ const ObjectFile &Obj, const RuntimeDyld::LoadedObjectInfo &L) { ++ ++ if (!SuccessfullyInitialized) ++ return; ++ ++ OwningBinary DebugObjOwner = L.getObjectForDebug(Obj); ++ const ObjectFile &DebugObj = *DebugObjOwner.getBinary(); ++ ++ // Get the address of the object image for use as a unique identifier ++ std::unique_ptr Context = DWARFContext::create(DebugObj); ++ ++ // Use symbol info to iterate over functions in the object. 
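++  // computeSymbolSizes() pairs each symbol with its size; only ST_Function
++  // symbols are kept, and each gets a debug-info record followeded by a
++  // code-load record, in the order the jitdump spec requires.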
++ for (const std::pair &P : computeSymbolSizes(DebugObj)) { ++ SymbolRef Sym = P.first; ++ std::string SourceFileName; ++ ++ Expected SymTypeOrErr = Sym.getType(); ++ if (!SymTypeOrErr) { ++ // There's not much we can with errors here ++ consumeError(SymTypeOrErr.takeError()); ++ continue; ++ } ++ SymbolRef::Type SymType = *SymTypeOrErr; ++ if (SymType != SymbolRef::ST_Function) ++ continue; ++ ++ Expected Name = Sym.getName(); ++ if (!Name) { ++ consumeError(Name.takeError()); ++ continue; ++ } ++ ++ Expected AddrOrErr = Sym.getAddress(); ++ if (!AddrOrErr) { ++ consumeError(AddrOrErr.takeError()); ++ continue; ++ } ++ uint64_t Addr = *AddrOrErr; ++ uint64_t Size = P.second; ++ ++ // According to spec debugging info has to come before loading the ++ // corresonding code load. ++ DILineInfoTable Lines = Context->getLineInfoForAddressRange( ++ Addr, Size, FileLineInfoKind::AbsoluteFilePath); ++ ++ NotifyDebug(Addr, Lines); ++ NotifyCode(Name, Addr, Size); ++ } ++ ++ Dumpstream->flush(); ++} ++ ++void PerfJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) { ++ // perf currently doesn't have an interface for unloading. But munmap()ing the ++ // code section does, so that's ok. ++} ++ ++bool PerfJITEventListener::InitDebuggingDir() { ++ time_t Time; ++ struct tm LocalTime; ++ char TimeBuffer[sizeof("YYYYMMDD")]; ++ SmallString<64> Path; ++ ++ // search for location to dump data to ++ if (const char *BaseDir = getenv("JITDUMPDIR")) ++ Path.append(BaseDir); ++ else if (!sys::path::home_directory(Path)) ++ Path = "."; ++ ++ // create debug directory ++ Path += "/.debug/jit/"; ++ if (auto EC = sys::fs::create_directories(Path)) { ++ errs() << "could not create jit cache directory " << Path << ": " ++ << EC.message() << "\n"; ++ return false; ++ } ++ ++ // create unique directory for dump data related to this process ++ time(&Time); ++ localtime_r(&Time, &LocalTime); ++ strftime(TimeBuffer, sizeof(TimeBuffer), "%Y%m%d", &LocalTime); ++ Path += JIT_LANG "-jit-"; ++ Path += TimeBuffer; ++ ++ SmallString<128> UniqueDebugDir; ++ ++ using sys::fs::createUniqueDirectory; ++ if (auto EC = createUniqueDirectory(Path, UniqueDebugDir)) { ++ errs() << "could not create unique jit cache directory " << UniqueDebugDir ++ << ": " << EC.message() << "\n"; ++ return false; ++ } ++ ++ JitPath = UniqueDebugDir.str(); ++ ++ return true; ++} ++ ++bool PerfJITEventListener::OpenMarker() { ++ // We mmap the jitdump to create an MMAP RECORD in perf.data file. The mmap ++ // is captured either live (perf record running when we mmap) or in deferred ++ // mode, via /proc/PID/maps. The MMAP record is used as a marker of a jitdump ++ // file for more meta data info about the jitted code. Perf report/annotate ++ // detect this special filename and process the jitdump file. ++ // ++ // Mapping must be PROT_EXEC to ensure it is captured by perf record ++ // even when not using -d option. 
++ MarkerAddr = ::mmap(NULL, sys::Process::getPageSize(), PROT_READ | PROT_EXEC, ++ MAP_PRIVATE, DumpFd, 0); ++ ++ if (MarkerAddr == MAP_FAILED) { ++ errs() << "could not mmap JIT marker\n"; ++ return false; ++ } ++ return true; ++} ++ ++void PerfJITEventListener::CloseMarker() { ++ if (!MarkerAddr) ++ return; ++ ++ munmap(MarkerAddr, sys::Process::getPageSize()); ++ MarkerAddr = nullptr; ++} ++ ++bool PerfJITEventListener::FillMachine(LLVMPerfJitHeader &hdr) { ++ char id[16]; ++ struct { ++ uint16_t e_type; ++ uint16_t e_machine; ++ } info; ++ ++ size_t RequiredMemory = sizeof(id) + sizeof(info); ++ ++ ErrorOr> MB = ++ MemoryBuffer::getFileSlice("/proc/self/exe", ++ RequiredMemory, ++ 0); ++ ++ // This'll not guarantee that enough data was actually read from the ++ // underlying file. Instead the trailing part of the buffer would be ++ // zeroed. Given the ELF signature check below that seems ok though, ++ // it's unlikely that the file ends just after that, and the ++ // consequence would just be that perf wouldn't recognize the ++ // signature. ++ if (auto EC = MB.getError()) { ++ errs() << "could not open /proc/self/exe: " << EC.message() << "\n"; ++ return false; ++ } ++ ++ memcpy(&id, (*MB)->getBufferStart(), sizeof(id)); ++ memcpy(&info, (*MB)->getBufferStart() + sizeof(id), sizeof(info)); ++ ++ // check ELF signature ++ if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F') { ++ errs() << "invalid elf signature\n"; ++ return false; ++ } ++ ++ hdr.ElfMach = info.e_machine; ++ ++ return true; ++} ++ ++void PerfJITEventListener::NotifyCode(Expected &Symbol, ++ uint64_t CodeAddr, uint64_t CodeSize) { ++ assert(SuccessfullyInitialized); ++ ++ // 0 length functions can't have samples. ++ if (CodeSize == 0) ++ return; ++ ++ LLVMPerfJitRecordCodeLoad rec; ++ rec.Prefix.Id = JIT_CODE_LOAD; ++ rec.Prefix.TotalSize = sizeof(rec) + // debug record itself ++ Symbol->size() + 1 + // symbol name ++ CodeSize; // and code ++ rec.Prefix.Timestamp = perf_get_timestamp(); ++ ++ rec.CodeSize = CodeSize; ++ rec.Vma = 0; ++ rec.CodeAddr = CodeAddr; ++ rec.Pid = Pid; ++ rec.Tid = get_threadid(); ++ ++ // avoid interspersing output ++ MutexGuard Guard(Mutex); ++ ++ rec.CodeIndex = CodeGeneration++; // under lock! ++ ++ Dumpstream->write(reinterpret_cast(&rec), sizeof(rec)); ++ Dumpstream->write(Symbol->data(), Symbol->size() + 1); ++ Dumpstream->write(reinterpret_cast(CodeAddr), CodeSize); ++} ++ ++void PerfJITEventListener::NotifyDebug(uint64_t CodeAddr, ++ DILineInfoTable Lines) { ++ assert(SuccessfullyInitialized); ++ ++ // Didn't get useful debug info. ++ if (Lines.empty()) ++ return; ++ ++ LLVMPerfJitRecordDebugInfo rec; ++ rec.Prefix.Id = JIT_CODE_DEBUG_INFO; ++ rec.Prefix.TotalSize = sizeof(rec); // will be increased further ++ rec.Prefix.Timestamp = perf_get_timestamp(); ++ rec.CodeAddr = CodeAddr; ++ rec.NrEntry = Lines.size(); ++ ++ // compute total size size of record (variable due to filenames) ++ DILineInfoTable::iterator Begin = Lines.begin(); ++ DILineInfoTable::iterator End = Lines.end(); ++ for (DILineInfoTable::iterator It = Begin; It != End; ++It) { ++ DILineInfo &line = It->second; ++ rec.Prefix.TotalSize += sizeof(LLVMPerfJitDebugEntry); ++ rec.Prefix.TotalSize += line.FileName.size() + 1; ++ } ++ ++ // The debug_entry describes the source line information. 
It is defined as ++ // follows in order: ++ // * uint64_t code_addr: address of function for which the debug information ++ // is generated ++ // * uint32_t line : source file line number (starting at 1) ++ // * uint32_t discrim : column discriminator, 0 is default ++ // * char name[n] : source file name in ASCII, including null termination ++ ++ // avoid interspersing output ++ MutexGuard Guard(Mutex); ++ ++ Dumpstream->write(reinterpret_cast(&rec), sizeof(rec)); ++ ++ for (DILineInfoTable::iterator It = Begin; It != End; ++It) { ++ LLVMPerfJitDebugEntry LineInfo; ++ DILineInfo &Line = It->second; ++ ++ LineInfo.Addr = It->first; ++ // The function re-created by perf is preceded by a elf ++ // header. Need to adjust for that, otherwise the results are ++ // wrong. ++ LineInfo.Addr += 0x40; ++ LineInfo.Lineno = Line.Line; ++ LineInfo.Discrim = Line.Discriminator; ++ ++ Dumpstream->write(reinterpret_cast(&LineInfo), ++ sizeof(LineInfo)); ++ Dumpstream->write(Line.FileName.c_str(), Line.FileName.size() + 1); ++ } ++} ++ ++// There should be only a single event listener per process, otherwise perf gets ++// confused. ++llvm::ManagedStatic PerfListener; ++ ++} // end anonymous namespace ++ ++namespace llvm { ++JITEventListener *JITEventListener::createPerfJITEventListener() { ++ return &*PerfListener; ++} ++ ++} // namespace llvm ++ +-- +2.17.1 + diff --git a/patches/llvm-D46460.patch b/patches/llvm-D46460.patch new file mode 100644 index 0000000..ec0a823 --- /dev/null +++ b/patches/llvm-D46460.patch @@ -0,0 +1,26 @@ +Index: lib/Analysis/LoopInfo.cpp +=================================================================== +--- a/lib/Analysis/LoopInfo.cpp ++++ b/lib/Analysis/LoopInfo.cpp +@@ -223,15 +223,14 @@ + BasicBlock *H = getHeader(); + for (BasicBlock *BB : this->blocks()) { + TerminatorInst *TI = BB->getTerminator(); +- MDNode *MD = nullptr; + + // Check if this terminator branches to the loop header. +- for (BasicBlock *Successor : TI->successors()) { +- if (Successor == H) { +- MD = TI->getMetadata(LLVMContext::MD_loop); +- break; +- } +- } ++ bool IsPredecessor = any_of(TI->successors(), ++ [=](BasicBlock *Successor) { return Successor == H; }); ++ if (!IsPredecessor) ++ continue; ++ ++ MDNode *MD = TI->getMetadata(LLVMContext::MD_loop); + if (!MD) + return nullptr; + diff --git a/patches/llvm-D49832-SCEVPred.patch b/patches/llvm-D49832-SCEVPred.patch new file mode 100644 index 0000000..47be214 --- /dev/null +++ b/patches/llvm-D49832-SCEVPred.patch @@ -0,0 +1,187 @@ +commit 98592fcc61307968f7df1362771534595a1e1c21 +Author: Keno Fischer +Date: Wed Jul 25 19:29:02 2018 -0400 + + [SCEV] Don't expand Wrap predicate using inttoptr in ni addrspaces + + Summary: + In non-integral address spaces, we're not allowed to introduce inttoptr/ptrtoint + intrinsics. Instead, we need to expand any pointer arithmetic as geps on the + base pointer. Luckily this is a common task for SCEV, so all we have to do here + is hook up the corresponding helper function and add test case. 
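+    With this change the overflow check for a pointer AddRec in a
+    non-integral address space is emitted as getelementptr plus pointer
+    compares rather than ptrtoint/add/icmp; the new wrapping-pointer-ni.ll
+    test checks for exactly this.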
+ + Fixes PR38290 + + Reviewers: reames, sanjoy + + Subscribers: javed.absar, llvm-commits + + Differential Revision: https://reviews.llvm.org/D49832 + +diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp +index 7f76f057216..f441a3647fb 100644 +--- a/lib/Analysis/ScalarEvolutionExpander.cpp ++++ b/lib/Analysis/ScalarEvolutionExpander.cpp +@@ -2157,8 +2157,9 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, + const SCEV *Step = AR->getStepRecurrence(SE); + const SCEV *Start = AR->getStart(); + ++ Type *ARTy = AR->getType(); + unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType()); +- unsigned DstBits = SE.getTypeSizeInBits(AR->getType()); ++ unsigned DstBits = SE.getTypeSizeInBits(ARTy); + + // The expression {Start,+,Step} has nusw/nssw if + // Step < 0, Start - |Step| * Backedge <= Start +@@ -2170,11 +2171,12 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, + Value *TripCountVal = expandCodeFor(ExitCount, CountTy, Loc); + + IntegerType *Ty = +- IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(AR->getType())); ++ IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy)); ++ Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty; + + Value *StepValue = expandCodeFor(Step, Ty, Loc); + Value *NegStepValue = expandCodeFor(SE.getNegativeSCEV(Step), Ty, Loc); +- Value *StartValue = expandCodeFor(Start, Ty, Loc); ++ Value *StartValue = expandCodeFor(Start, ARExpandTy, Loc); + + ConstantInt *Zero = + ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits)); +@@ -2197,8 +2199,21 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, + // Compute: + // Start + |Step| * Backedge < Start + // Start - |Step| * Backedge > Start +- Value *Add = Builder.CreateAdd(StartValue, MulV); +- Value *Sub = Builder.CreateSub(StartValue, MulV); ++ Value *Add = nullptr, *Sub = nullptr; ++ if (ARExpandTy->isPointerTy()) { ++ PointerType *ARPtrTy = cast(ARExpandTy); ++ const SCEV *MulS = SE.getSCEV(MulV); ++ const SCEV *const StepArray[2] = {MulS, SE.getNegativeSCEV(MulS)}; ++ Add = Builder.CreateBitCast( ++ expandAddToGEP(&StepArray[0], &StepArray[1], ARPtrTy, Ty, StartValue), ++ ARPtrTy); ++ Sub = Builder.CreateBitCast( ++ expandAddToGEP(&StepArray[1], &StepArray[2], ARPtrTy, Ty, StartValue), ++ ARPtrTy); ++ } else { ++ Add = Builder.CreateAdd(StartValue, MulV); ++ Sub = Builder.CreateSub(StartValue, MulV); ++ } + + Value *EndCompareGT = Builder.CreateICmp( + Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue); +diff --git a/test/Analysis/LoopAccessAnalysis/wrapping-pointer-ni.ll b/test/Analysis/LoopAccessAnalysis/wrapping-pointer-ni.ll +new file mode 100644 +index 00000000000..ddcf5e1a195 +--- /dev/null ++++ b/test/Analysis/LoopAccessAnalysis/wrapping-pointer-ni.ll +@@ -0,0 +1,73 @@ ++; RUN: opt -loop-versioning -S < %s | FileCheck %s -check-prefix=LV ++ ++; NB: addrspaces 10-13 are non-integral ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" ++ ++; This matches the test case from PR38290 ++; Check that we expand the SCEV predicate check using GEP, rather ++; than ptrtoint. 
++ ++%jl_value_t = type opaque ++%jl_array_t = type { i8 addrspace(13)*, i64, i16, i16, i32 } ++ ++declare i64 @julia_steprange_last_4949() ++ ++define void @"japi1_align!_9477"(%jl_value_t addrspace(10)**) #0 { ++; LV-LAVEL: L26.lver.check ++; LV: [[OFMul:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[Step:%[^ ]*]]) ++; LV-NEXT: [[OFMulResult:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 0 ++; LV-NEXT: [[OFMulOverflow:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 1 ++; LV-NEXT: [[PosGEP:%[^ ]*]] = getelementptr i32, i32 addrspace(13)* [[Base:%[^ ]*]], i64 [[Step]] ++; LV-NEXT: [[NegGEP:%[^ ]*]] = getelementptr i32, i32 addrspace(13)* [[Base]], i64 [[NegStep:%[^ ]*]] ++; LV-NEXT: icmp ugt i32 addrspace(13)* [[NegGEP]], [[Base]] ++; LV-NEXT: icmp ult i32 addrspace(13)* [[PosGEP]], [[Base]] ++; LV-NOT: inttoptr ++; LV-NOT: ptrtoint ++top: ++ %1 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %0, align 8, !nonnull !1, !dereferenceable !2, !align !3 ++ %2 = load i32, i32* inttoptr (i64 12 to i32*), align 4, !tbaa !4 ++ %3 = sub i32 0, %2 ++ %4 = call i64 @julia_steprange_last_4949() ++ %5 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)* ++ %6 = bitcast %jl_value_t addrspace(11)* %5 to %jl_value_t addrspace(10)* addrspace(11)* ++ %7 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %6, align 8, !tbaa !4, !nonnull !1, !dereferenceable !9, !align !2 ++ %8 = addrspacecast %jl_value_t addrspace(10)* %7 to %jl_value_t addrspace(11)* ++ %9 = bitcast %jl_value_t addrspace(11)* %8 to i32 addrspace(13)* addrspace(11)* ++ %10 = load i32 addrspace(13)*, i32 addrspace(13)* addrspace(11)* %9, align 8, !tbaa !10, !nonnull !1 ++ %11 = sext i32 %3 to i64 ++ br label %L26 ++ ++L26: ; preds = %L26, %top ++ %value_phi3 = phi i64 [ 0, %top ], [ %12, %L26 ] ++ %12 = add i64 %value_phi3, -1 ++ %13 = getelementptr inbounds i32, i32 addrspace(13)* %10, i64 %12 ++ %14 = load i32, i32 addrspace(13)* %13, align 4, !tbaa !13 ++ %15 = add i64 %12, %11 ++ %16 = getelementptr inbounds i32, i32 addrspace(13)* %10, i64 %15 ++ store i32 %14, i32 addrspace(13)* %16, align 4, !tbaa !13 ++ %17 = icmp eq i64 %value_phi3, %4 ++ br i1 %17, label %L45, label %L26 ++ ++L45: ; preds = %L26 ++ ret void ++} ++ ++attributes #0 = { "thunk" } ++ ++!llvm.module.flags = !{!0} ++ ++!0 = !{i32 1, !"Debug Info Version", i32 3} ++!1 = !{} ++!2 = !{i64 16} ++!3 = !{i64 8} ++!4 = !{!5, !5, i64 0} ++!5 = !{!"jtbaa_mutab", !6, i64 0} ++!6 = !{!"jtbaa_value", !7, i64 0} ++!7 = !{!"jtbaa_data", !8, i64 0} ++!8 = !{!"jtbaa"} ++!9 = !{i64 40} ++!10 = !{!11, !11, i64 0} ++!11 = !{!"jtbaa_arrayptr", !12, i64 0} ++!12 = !{!"jtbaa_array", !8, i64 0} ++!13 = !{!14, !14, i64 0} ++!14 = !{!"jtbaa_arraybuf", !7, i64 0} +diff --git a/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll b/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll +index a7e5bce7445..fa6fccecbf1 100644 +--- a/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll ++++ b/test/Analysis/LoopAccessAnalysis/wrapping-pointer-versioning.ll +@@ -58,10 +58,10 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + ; LV-NEXT: [[OFMul1:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[BE]]) + ; LV-NEXT: [[OFMulResult1:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul1]], 0 + ; LV-NEXT: [[OFMulOverflow1:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul1]], 1 +-; LV-NEXT: [[AddEnd1:%[^ ]*]] = add i64 %a2, [[OFMulResult1]] +-; LV-NEXT: [[SubEnd1:%[^ ]*]] = sub 
i64 %a2, [[OFMulResult1]]
+-; LV-NEXT: [[CmpNeg1:%[^ ]*]] = icmp ugt i64 [[SubEnd1]], %a2
+-; LV-NEXT: [[CmpPos1:%[^ ]*]] = icmp ult i64 [[AddEnd1]], %a2
++; LV-NEXT: [[AddEnd1:%[^ ]*]] = add i64 [[A0:%[^ ]*]], [[OFMulResult1]]
++; LV-NEXT: [[SubEnd1:%[^ ]*]] = sub i64 [[A0]], [[OFMulResult1]]
++; LV-NEXT: [[CmpNeg1:%[^ ]*]] = icmp ugt i64 [[SubEnd1]], [[A0]]
++; LV-NEXT: [[CmpPos1:%[^ ]*]] = icmp ult i64 [[AddEnd1]], [[A0]]
+ ; LV-NEXT: [[Cmp:%[^ ]*]] = select i1 false, i1 [[CmpNeg1]], i1 [[CmpPos1]]
+ ; LV-NEXT: [[PredCheck1:%[^ ]*]] = or i1 [[Cmp]], [[OFMulOverflow1]]
+
+@@ -233,10 +233,10 @@ for.end: ; preds = %for.body
+ ; LV: [[OFMul1:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[BE:%[^ ]*]])
+ ; LV-NEXT: [[OFMulResult1:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul1]], 0
+ ; LV-NEXT: [[OFMulOverflow1:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul1]], 1
+-; LV-NEXT: [[AddEnd1:%[^ ]*]] = add i64 %a2, [[OFMulResult1]]
+-; LV-NEXT: [[SubEnd1:%[^ ]*]] = sub i64 %a2, [[OFMulResult1]]
+-; LV-NEXT: [[CmpNeg1:%[^ ]*]] = icmp ugt i64 [[SubEnd1]], %a2
+-; LV-NEXT: [[CmpPos1:%[^ ]*]] = icmp ult i64 [[AddEnd1]], %a2
++; LV-NEXT: [[AddEnd1:%[^ ]*]] = add i64 [[A0:%[^ ]*]], [[OFMulResult1]]
++; LV-NEXT: [[SubEnd1:%[^ ]*]] = sub i64 [[A0]], [[OFMulResult1]]
++; LV-NEXT: [[CmpNeg1:%[^ ]*]] = icmp ugt i64 [[SubEnd1]], [[A0]]
++; LV-NEXT: [[CmpPos1:%[^ ]*]] = icmp ult i64 [[AddEnd1]], [[A0]]
+ ; LV-NEXT: [[Cmp:%[^ ]*]] = select i1 false, i1 [[CmpNeg1]], i1 [[CmpPos1]]
+ ; LV-NEXT: [[PredCheck1:%[^ ]*]] = or i1 [[Cmp]], [[OFMulOverflow1]]
+
diff --git a/patches/llvm-D50010-VNCoercion-ni.patch b/patches/llvm-D50010-VNCoercion-ni.patch
new file mode 100644
index 0000000..cb658d1
--- /dev/null
+++ b/patches/llvm-D50010-VNCoercion-ni.patch
@@ -0,0 +1,89 @@
+commit 8eb2b102a203d83fb713f3bf79acf235dabdd8cd
+Author: Keno Fischer
+Date: Mon Jul 30 16:59:08 2018 -0400
+
+ [VNCoercion] Disallow coercion between different ni addrspaces
+
+ Summary:
+ I'm not sure if it would be legal by the IR reference to introduce
+ an addrspacecast here, since the IR reference is a bit vague on
+ the exact semantics, but at least for our usage of it (and I
+ suspect for many others' usage) it is not. For us, addrspacecasts
+ between non-integral address spaces carry frontend information that the
+ optimizer cannot deduce afterwards in a generic way (though we
+ have frontend-specific passes in our pipeline that do propagate
+ these). In any case, I'm sure nobody is using it this way at
+ the moment, since it would have introduced inttoptrs, which
+ are definitely illegal.
+
+ Fixes PR38375
+
+ Reviewers: sanjoy, reames, dberlin
+
+ Subscribers: llvm-commits
+
+ Differential Revision: https://reviews.llvm.org/D50010
+
+diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
+index c3feea6a0a4..735d1e7b792 100644
+--- a/lib/Transforms/Utils/VNCoercion.cpp
++++ b/lib/Transforms/Utils/VNCoercion.cpp
+@@ -20,14 +20,21 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+ return false;
+
++ Type *StoredValTy = StoredVal->getType();
++
+ // The store has to be at least as big as the load.
+ if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ return false;
+
+- // Don't coerce non-integral pointers to integers or vice versa.
+- if (DL.isNonIntegralPointerType(StoredVal->getType()) !=
+- DL.isNonIntegralPointerType(LoadTy))
++ bool StoredNI = DL.isNonIntegralPointerType(StoredValTy);
++ bool LoadNI = DL.isNonIntegralPointerType(LoadTy);
++ if (StoredNI != LoadNI) {
+ return false;
++ } else if (StoredNI && LoadNI &&
++ cast<PointerType>(StoredValTy)->getAddressSpace() !=
++ cast<PointerType>(LoadTy)->getAddressSpace()) {
++ return false;
++ }
+
+ return true;
+ }
+diff --git a/test/Transforms/GVN/non-integral-pointers.ll b/test/Transforms/GVN/non-integral-pointers.ll
+index 9ae4132231d..5217fc1a06a 100644
+--- a/test/Transforms/GVN/non-integral-pointers.ll
++++ b/test/Transforms/GVN/non-integral-pointers.ll
+@@ -1,6 +1,6 @@
+ ; RUN: opt -gvn -S < %s | FileCheck %s
+
+-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4"
++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4:5"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ define void @f0(i1 %alwaysFalse, i64 %val, i64* %loc) {
+@@ -37,3 +37,21 @@ define i64 @f1(i1 %alwaysFalse, i8 addrspace(4)* %val, i8 addrspace(4)** %loc) {
+ alwaysTaken:
+ ret i64 42
+ }
++
++ define i8 addrspace(5)* @multini(i1 %alwaysFalse, i8 addrspace(4)* %val, i8 addrspace(4)** %loc) {
++ ; CHECK-LABEL: @multini(
++ ; CHECK-NOT: inttoptr
++ ; CHECK-NOT: ptrtoint
++ ; CHECK-NOT: addrspacecast
++ entry:
++ store i8 addrspace(4)* %val, i8 addrspace(4)** %loc
++ br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
++
++ neverTaken:
++ %loc.bc = bitcast i8 addrspace(4)** %loc to i8 addrspace(5)**
++ %differentas = load i8 addrspace(5)*, i8 addrspace(5)** %loc.bc
++ ret i8 addrspace(5)* %differentas
++
++ alwaysTaken:
++ ret i8 addrspace(5)* null
++ }
diff --git a/patches/llvm-D50167-scev-umin.patch b/patches/llvm-D50167-scev-umin.patch
new file mode 100644
index 0000000..5a968a4
--- /dev/null
+++ b/patches/llvm-D50167-scev-umin.patch
@@ -0,0 +1,1153 @@
+commit 556c30af1c797be294edde0ce621884f5acf11f0
+Author: Keno Fischer
+Date: Wed Aug 1 20:45:11 2018 -0400
+
+ RFC: [SCEV] Add explicit representations of umin/smin
+
+ Summary:
+ Currently we express umin as `~umax(~x, ~y)`. However, this becomes
+ a problem for operands in non-integral pointer spaces, because `~x`
+ is not something we can compute for `x` non-integral. However, since
+ comparisons are generally still allowed, we are actually able to
+ express `umin(x, y)` directly as long as we don't try to express it
+ as a umax. Support this by adding an explicit umin/smin representation
+ to SCEV. We do this by factoring the existing getUMax/getSMax functions
+ into a new function that does all four. The previous two functions
+ were largely identical, except that the SMax variant used `isKnownPredicate`
+ while the UMax variant used `isKnownViaNonRecursiveReasoning`.
+
+ Trying to make the UMax variant also use `isKnownPredicate` leads to
+ an infinite recursion, while trying to make the `SMax` variant use
+ `isKnownViaNonRecursiveReasoning` causes
+ `Transforms/IndVarSimplify/backedge-on-min-max.ll` to fail.
+
+ I would appreciate any insight into which predicate is correct here.
+
+ Reviewers: reames, sanjoy
+
+ Subscribers: javed.absar, llvm-commits
+
+ Differential Revision: https://reviews.llvm.org/D50167
+
+diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
+index 21b72f3e13c..9fd6794395c 100644
+--- a/include/llvm/Analysis/ScalarEvolution.h
++++ b/include/llvm/Analysis/ScalarEvolution.h
+@@ -582,12 +582,15 @@ public:
+ /// \p IndexExprs The expressions for the indices.
+ const SCEV *getGEPExpr(GEPOperator *GEP,
+ const SmallVectorImpl<const SCEV *> &IndexExprs);
++ const SCEV *getUSMinMaxExpr(unsigned Kind, SmallVectorImpl<const SCEV *> &Operands);
+ const SCEV *getSMaxExpr(const SCEV *LHS, const SCEV *RHS);
+ const SCEV *getSMaxExpr(SmallVectorImpl<const SCEV *> &Operands);
+ const SCEV *getUMaxExpr(const SCEV *LHS, const SCEV *RHS);
+ const SCEV *getUMaxExpr(SmallVectorImpl<const SCEV *> &Operands);
+ const SCEV *getSMinExpr(const SCEV *LHS, const SCEV *RHS);
++ const SCEV *getSMinExpr(SmallVectorImpl<const SCEV *> &Operands);
+ const SCEV *getUMinExpr(const SCEV *LHS, const SCEV *RHS);
++ const SCEV *getUMinExpr(SmallVectorImpl<const SCEV *> &Operands);
+ const SCEV *getUnknown(Value *V);
+ const SCEV *getCouldNotCompute();
+
+diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
+index 3df04e98bd2..9e407c63abc 100644
+--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
++++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
+@@ -367,6 +367,10 @@ namespace llvm {
+
+ Value *visitUMaxExpr(const SCEVUMaxExpr *S);
+
++ Value *visitSMinExpr(const SCEVSMinExpr *S);
++
++ Value *visitUMinExpr(const SCEVUMinExpr *S);
++
+ Value *visitUnknown(const SCEVUnknown *S) {
+ return S->getValue();
+ }
+diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
+index acf83455cdc..0d20a1bcdcc 100644
+--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
++++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
+@@ -40,7 +40,7 @@ class Type;
+ // These should be ordered in terms of increasing complexity to make the
+ // folders simpler.
+ scConstant, scTruncate, scZeroExtend, scSignExtend, scAddExpr, scMulExpr,
+- scUDivExpr, scAddRecExpr, scUMaxExpr, scSMaxExpr,
++ scUDivExpr, scAddRecExpr, scUMaxExpr, scSMaxExpr, scUMinExpr, scSMinExpr,
+ scUnknown, scCouldNotCompute
+ };
+
+@@ -187,6 +187,8 @@ class Type;
+ S->getSCEVType() == scMulExpr ||
+ S->getSCEVType() == scSMaxExpr ||
+ S->getSCEVType() == scUMaxExpr ||
++ S->getSCEVType() == scSMinExpr ||
++ S->getSCEVType() == scUMinExpr ||
+ S->getSCEVType() == scAddRecExpr;
+ }
+ };
+@@ -204,7 +206,9 @@ class Type;
+ return S->getSCEVType() == scAddExpr ||
+ S->getSCEVType() == scMulExpr ||
+ S->getSCEVType() == scSMaxExpr ||
+- S->getSCEVType() == scUMaxExpr;
++ S->getSCEVType() == scUMaxExpr ||
++ S->getSCEVType() == scSMinExpr ||
++ S->getSCEVType() == scUMinExpr;
+ }
+
+ /// Set flags for a non-recurrence without clearing previously set flags.
+@@ -396,6 +400,42 @@ class Type;
+ }
+ };
+
++ /// This class represents a signed minimum selection.
++ class SCEVSMinExpr : public SCEVCommutativeExpr {
++ friend class ScalarEvolution;
++
++ SCEVSMinExpr(const FoldingSetNodeIDRef ID,
++ const SCEV *const *O, size_t N)
++ : SCEVCommutativeExpr(ID, scSMinExpr, O, N) {
++ // Min never overflows.
++ setNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW));
++ }
++
++ public:
++ /// Methods for support type inquiry through isa, cast, and dyn_cast:
++ static bool classof(const SCEV *S) {
++ return S->getSCEVType() == scSMinExpr;
++ }
++ };
++
++ /// This class represents an unsigned minimum selection.
++ class SCEVUMinExpr : public SCEVCommutativeExpr {
++ friend class ScalarEvolution;
++
++ SCEVUMinExpr(const FoldingSetNodeIDRef ID,
++ const SCEV *const *O, size_t N)
++ : SCEVCommutativeExpr(ID, scUMinExpr, O, N) {
++ // Min never overflows.
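++ // (A min simply selects one of its existing operands and performs no
++ // arithmetic, so neither signed nor unsigned wrapping can occur.)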
++ setNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW));
++ }
++
++ public:
++ /// Methods for support type inquiry through isa, cast, and dyn_cast:
++ static bool classof(const SCEV *S) {
++ return S->getSCEVType() == scUMinExpr;
++ }
++ };
++
+ /// This means that we are dealing with an entirely unknown SCEV
+ /// value, and only represent it as its LLVM Value. This is the
+ /// "bottom" value for the analysis.
+@@ -468,6 +508,10 @@ class Type;
+ return ((SC*)this)->visitSMaxExpr((const SCEVSMaxExpr*)S);
+ case scUMaxExpr:
+ return ((SC*)this)->visitUMaxExpr((const SCEVUMaxExpr*)S);
++ case scSMinExpr:
++ return ((SC*)this)->visitSMinExpr((const SCEVSMinExpr*)S);
++ case scUMinExpr:
++ return ((SC*)this)->visitUMinExpr((const SCEVUMinExpr*)S);
+ case scUnknown:
+ return ((SC*)this)->visitUnknown((const SCEVUnknown*)S);
+ case scCouldNotCompute:
+@@ -521,6 +565,8 @@ class Type;
+ case scMulExpr:
+ case scSMaxExpr:
+ case scUMaxExpr:
++ case scSMinExpr:
++ case scUMinExpr:
+ case scAddRecExpr:
+ for (const auto *Op : cast<SCEVNAryExpr>(S)->operands())
+ push(Op);
+@@ -683,6 +729,26 @@ class Type;
+ return !Changed ? Expr : SE.getUMaxExpr(Operands);
+ }
+
++ const SCEV *visitSMinExpr(const SCEVSMinExpr *Expr) {
++ SmallVector<const SCEV *, 2> Operands;
++ bool Changed = false;
++ for (auto *Op : Expr->operands()) {
++ Operands.push_back(((SC *)this)->visit(Op));
++ Changed |= Op != Operands.back();
++ }
++ return !Changed ? Expr : SE.getSMinExpr(Operands);
++ }
++
++ const SCEV *visitUMinExpr(const SCEVUMinExpr *Expr) {
++ SmallVector<const SCEV *, 2> Operands;
++ bool Changed = false;
++ for (auto *Op : Expr->operands()) {
++ Operands.push_back(((SC*)this)->visit(Op));
++ Changed |= Op != Operands.back();
++ }
++ return !Changed ? Expr : SE.getUMinExpr(Operands);
++ }
++
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ return Expr;
+ }
+diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
+index bfff7afb5b4..750c1fdfdfb 100644
+--- a/lib/Analysis/ScalarEvolution.cpp
++++ b/lib/Analysis/ScalarEvolution.cpp
+@@ -271,7 +271,9 @@ void SCEV::print(raw_ostream &OS) const {
+ case scAddExpr:
+ case scMulExpr:
+ case scUMaxExpr:
+- case scSMaxExpr: {
++ case scSMaxExpr:
++ case scUMinExpr:
++ case scSMinExpr: {
+ const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this);
+ const char *OpStr = nullptr;
+ switch (NAry->getSCEVType()) {
+@@ -279,6 +281,8 @@ void SCEV::print(raw_ostream &OS) const {
+ case scMulExpr: OpStr = " * "; break;
+ case scUMaxExpr: OpStr = " umax "; break;
+ case scSMaxExpr: OpStr = " smax "; break;
++ case scUMinExpr: OpStr = " umin "; break;
++ case scSMinExpr: OpStr = " smin "; break;
+ }
+ OS << "(";
+ for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
+@@ -347,6 +351,8 @@ Type *SCEV::getType() const {
+ case scMulExpr:
+ case scUMaxExpr:
+ case scSMaxExpr:
++ case scUMinExpr:
++ case scSMinExpr:
+ return cast<SCEVNAryExpr>(this)->getType();
+ case scAddExpr:
+ return cast<SCEVAddExpr>(this)->getType();
+@@ -718,7 +724,9 @@ static int CompareSCEVComplexity(
+ case scAddExpr:
+ case scMulExpr:
+ case scSMaxExpr:
+- case scUMaxExpr: {
++ case scUMaxExpr:
++ case scSMinExpr:
++ case scUMinExpr: {
+ const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS);
+ const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS);
+
+@@ -922,6 +930,8 @@ public:
+ void visitUDivExpr(const SCEVUDivExpr *Numerator) {}
+ void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {}
+ void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {}
++ void visitSMinExpr(const SCEVSMinExpr *Numerator) {}
++ void visitUMinExpr(const SCEVUMinExpr *Numerator) {}
+ void visitUnknown(const SCEVUnknown *Numerator) {}
+ void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
+
+@@ -2276,6 +2286,8 @@ bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) {
+ case scMulExpr:
+ case scUMaxExpr:
+ case scSMaxExpr:
++ case scUMinExpr:
++ case scSMinExpr:
+ case scUDivExpr:
+ return true;
+ case scUnknown:
+@@ -3405,23 +3417,20 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
+ return getAddExpr(BaseExpr, TotalOffset, Wrap);
+ }
+
+-const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS,
+- const SCEV *RHS) {
+- SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
+- return getSMaxExpr(Ops);
+-}
+-
+ const SCEV *
+-ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+- assert(!Ops.empty() && "Cannot get empty smax!");
++ScalarEvolution::getUSMinMaxExpr(unsigned Kind, SmallVectorImpl<const SCEV *> &Ops) {
++ assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!");
+ if (Ops.size() == 1) return Ops[0];
+ #ifndef NDEBUG
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+ assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+- "SCEVSMaxExpr operand types don't match!");
++ "Operand types don't match!");
+ #endif
+
++ bool IsSigned = Kind == scSMaxExpr || Kind == scSMinExpr;
++ bool IsMax = Kind == scSMaxExpr || Kind == scUMaxExpr;
++
+ // Sort by complexity, this groups all similar expression types together.
+ GroupByComplexity(Ops, &LI, DT);
+
+@@ -3430,61 +3439,85 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+ if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+ ++Idx;
+ assert(Idx < Ops.size());
++ auto &FoldOp =
++ Kind == scSMaxExpr ? APIntOps::smax :
++ Kind == scSMinExpr ? APIntOps::smin :
++ Kind == scUMaxExpr ? APIntOps::umax :
++ APIntOps::umin;
+ while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+ // We found two constants, fold them together!
+ ConstantInt *Fold = ConstantInt::get(
+- getContext(), APIntOps::smax(LHSC->getAPInt(), RHSC->getAPInt()));
++ getContext(), FoldOp(LHSC->getAPInt(), RHSC->getAPInt()));
+ Ops[0] = getConstant(Fold);
+ Ops.erase(Ops.begin()+1); // Erase the folded element
+ if (Ops.size() == 1) return Ops[0];
+ LHSC = cast<SCEVConstant>(Ops[0]);
+ }
+
+- // If we are left with a constant minimum-int, strip it off.
+- if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(true)) {
+- Ops.erase(Ops.begin());
+- --Idx;
+- } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(true)) {
+- // If we have an smax with a constant maximum-int, it will always be
+- // maximum-int.
+- return Ops[0];
++ if (IsMax) {
++ // If we are left with a constant minimum-int, strip it off.
++ if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(IsSigned)) {
++ Ops.erase(Ops.begin());
++ --Idx;
++ } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(IsSigned)) {
++ // If we have an smax with a constant maximum-int, it will always be
++ // maximum-int.
++ return Ops[0];
++ }
++ } else {
++ // If we are left with a constant maximum-int, strip it off.
++ if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(IsSigned)) {
++ Ops.erase(Ops.begin());
++ --Idx;
++ } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(IsSigned)) {
++ // If we have a min with a constant minimum-int, it will always be
++ // minimum-int.
++ return Ops[0];
++ }
++ }
+
+ if (Ops.size() == 1) return Ops[0];
+ }
+
+- // Find the first SMax
+- while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scSMaxExpr)
++ // Find the first operation of the same kind
++ while (Idx < Ops.size() && Ops[Idx]->getSCEVType() != Kind)
+ ++Idx;
+
+ // Check to see if one of the operands is an SMax. If so, expand its operands
+ // onto our operand list, and recurse to simplify.
+ if (Idx < Ops.size()) {
+- bool DeletedSMax = false;
+- while (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(Ops[Idx])) {
++ bool DeletedAny = false;
++ while (Ops[Idx]->getSCEVType() == Kind) {
++ const SCEVCommutativeExpr *SCE = cast<SCEVCommutativeExpr>(Ops[Idx]);
+ Ops.erase(Ops.begin()+Idx);
+- Ops.append(SMax->op_begin(), SMax->op_end());
+- DeletedSMax = true;
++ Ops.append(SCE->op_begin(), SCE->op_end());
++ DeletedAny = true;
+ }
+
+- if (DeletedSMax)
+- return getSMaxExpr(Ops);
++ if (DeletedAny)
++ return getUSMinMaxExpr(Kind, Ops);
+ }
+
+ // Okay, check to see if the same value occurs in the operand list twice. If
+ // so, delete one. Since we sorted the list, these values are required to
+ // be adjacent.
+- for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+- // X smax Y smax Y --> X smax Y
+- // X smax Y --> X, if X is always greater than Y
+- if (Ops[i] == Ops[i+1] ||
+- isKnownPredicate(ICmpInst::ICMP_SGE, Ops[i], Ops[i+1])) {
+- Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
+- --i; --e;
+- } else if (isKnownPredicate(ICmpInst::ICMP_SLE, Ops[i], Ops[i+1])) {
+- Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+- --i; --e;
+- }
++ llvm::CmpInst::Predicate GEPred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
++ llvm::CmpInst::Predicate LEPred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
++ llvm::CmpInst::Predicate FirstPred = IsMax ? GEPred : LEPred;
++ llvm::CmpInst::Predicate SecondPred = IsMax ? LEPred : GEPred;
++ for (unsigned i = 0, e = Ops.size()-1; i != e; ++i) {
++ if (Ops[i] == Ops[i+1] ||
++ isKnownPredicate(FirstPred, Ops[i], Ops[i+1])) {
++ // X op Y op Y --> X op Y
++ // X op Y --> X, if we know X, Y are ordered appropriately
++ Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
++ --i; --e;
++ } else if (isKnownPredicate(SecondPred, Ops[i], Ops[i+1])) {
++ // X op Y --> Y, if we know X, Y are ordered appropriately
++ Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
++ --i; --e;
++ }
++ }
+
+ if (Ops.size() == 1) return Ops[0];
+
+@@ -3493,132 +3526,73 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+ // Okay, it looks like we really DO need an smax expr. Check to see if we
+ // already have one, otherwise create a new one.
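++ // The uniquing key is the expression kind plus the operand pointers, so
++ // e.g. umin(a, b) and umax(a, b) over the same operands get distinct nodes.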
+ FoldingSetNodeID ID;
+- ID.AddInteger(scSMaxExpr);
++ ID.AddInteger(Kind);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ ID.AddPointer(Ops[i]);
+ void *IP = nullptr;
+ if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+ const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+ std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+- SCEV *S = new (SCEVAllocator) SCEVSMaxExpr(ID.Intern(SCEVAllocator),
+- O, Ops.size());
++ SCEV *S = nullptr;
++
++ if (Kind == scSMaxExpr) {
++ S = new (SCEVAllocator) SCEVSMaxExpr(ID.Intern(SCEVAllocator),
++ O, Ops.size());
++ } else if (Kind == scUMaxExpr) {
++ S = new (SCEVAllocator) SCEVUMaxExpr(ID.Intern(SCEVAllocator),
++ O, Ops.size());
++ } else if (Kind == scSMinExpr) {
++ S = new (SCEVAllocator) SCEVSMinExpr(ID.Intern(SCEVAllocator),
++ O, Ops.size());
++ } else {
++ assert(Kind == scUMinExpr);
++ S = new (SCEVAllocator) SCEVUMinExpr(ID.Intern(SCEVAllocator),
++ O, Ops.size());
++ }
++
+ UniqueSCEVs.InsertNode(S, IP);
+ addToLoopUseLists(S);
+ return S;
+ }
+
+-const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS,
++const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS,
+ const SCEV *RHS) {
+ SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
+- return getUMaxExpr(Ops);
++ return getSMaxExpr(Ops);
+ }
+
+-const SCEV *
+-ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+- assert(!Ops.empty() && "Cannot get empty umax!");
+- if (Ops.size() == 1) return Ops[0];
+-#ifndef NDEBUG
+- Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+- for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+- assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+- "SCEVUMaxExpr operand types don't match!");
+-#endif
+-
+- // Sort by complexity, this groups all similar expression types together.
+- GroupByComplexity(Ops, &LI, DT);
+-
+- // If there are any constants, fold them together.
+- unsigned Idx = 0;
+- if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+- ++Idx;
+- assert(Idx < Ops.size());
+- while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+- // We found two constants, fold them together!
+- ConstantInt *Fold = ConstantInt::get(
+- getContext(), APIntOps::umax(LHSC->getAPInt(), RHSC->getAPInt()));
+- Ops[0] = getConstant(Fold);
+- Ops.erase(Ops.begin()+1); // Erase the folded element
+- if (Ops.size() == 1) return Ops[0];
+- LHSC = cast<SCEVConstant>(Ops[0]);
+- }
+-
+- // If we are left with a constant minimum-int, strip it off.
+- if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(false)) {
+- Ops.erase(Ops.begin());
+- --Idx;
+- } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(false)) {
+- // If we have an umax with a constant maximum-int, it will always be
+- // maximum-int.
+- return Ops[0];
+- }
+-
+- if (Ops.size() == 1) return Ops[0];
+- }
+-
+- // Find the first UMax
+- while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scUMaxExpr)
+- ++Idx;
+-
+- // Check to see if one of the operands is a UMax. If so, expand its operands
+- // onto our operand list, and recurse to simplify.
+- if (Idx < Ops.size()) {
+- bool DeletedUMax = false;
+- while (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(Ops[Idx])) {
+- Ops.erase(Ops.begin()+Idx);
+- Ops.append(UMax->op_begin(), UMax->op_end());
+- DeletedUMax = true;
+- }
+-
+- if (DeletedUMax)
+- return getUMaxExpr(Ops);
+- }
+-
+- // Okay, check to see if the same value occurs in the operand list twice. If
+- // so, delete one. Since we sorted the list, these values are required to
+- // be adjacent.
+- for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+- // X umax Y umax Y --> X umax Y
+- // X umax Y --> X, if X is always greater than Y
+- if (Ops[i] == Ops[i+1] ||
+- isKnownPredicate(ICmpInst::ICMP_UGE, Ops[i], Ops[i+1])) {
+- Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
+- --i; --e;
+- } else if (isKnownPredicate(ICmpInst::ICMP_ULE, Ops[i], Ops[i+1])) {
+- Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+- --i; --e;
+- }
+-
+- if (Ops.size() == 1) return Ops[0];
++const SCEV *ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
++ return getUSMinMaxExpr(scSMaxExpr, Ops);
++}
+
+- assert(!Ops.empty() && "Reduced umax down to nothing!");
++const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS,
++ const SCEV *RHS) {
++ SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
++ return getUMaxExpr(Ops);
++}
+
+- // Okay, it looks like we really DO need a umax expr. Check to see if we
+- // already have one, otherwise create a new one.
+- FoldingSetNodeID ID;
+- ID.AddInteger(scUMaxExpr);
+- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+- ID.AddPointer(Ops[i]);
+- void *IP = nullptr;
+- if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+- const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+- std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+- SCEV *S = new (SCEVAllocator) SCEVUMaxExpr(ID.Intern(SCEVAllocator),
+- O, Ops.size());
+- UniqueSCEVs.InsertNode(S, IP);
+- addToLoopUseLists(S);
+- return S;
++const SCEV *ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
++ return getUSMinMaxExpr(scUMaxExpr, Ops);
+ }
+
+ const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS,
+ const SCEV *RHS) {
+- // ~smax(~x, ~y) == smin(x, y).
+- return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
++ SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
++ return getSMinExpr(Ops);
++}
++
++const SCEV *ScalarEvolution::getSMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
++ return getUSMinMaxExpr(scSMinExpr, Ops);
+ }
+
+ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
+ const SCEV *RHS) {
+- // ~umax(~x, ~y) == umin(x, y)
+- return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
++ SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
++ return getUMinExpr(Ops);
++}
++
++const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
++ return getUSMinMaxExpr(scUMinExpr, Ops);
+ }
+
+ const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
+@@ -5002,6 +4976,7 @@ static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S,
+ switch (S->getSCEVType()) {
+ case scConstant: case scTruncate: case scZeroExtend: case scSignExtend:
+ case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr:
++ case scUMinExpr: case scSMinExpr:
+ // These expressions are available if their operand(s) is/are.
+ return true;
+
+@@ -7885,7 +7860,9 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
+ }
+ case scSMaxExpr:
+ case scUMaxExpr:
+- break; // TODO: smax, umax.
++ case scSMinExpr:
++ case scUMinExpr:
++ break; // TODO: smax, umax, smin, umin.
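++ // (Folding these into a Constant would need select/icmp constant
++ // expressions, which BuildConstantFromSCEV does not emit yet.)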
+ }
+ return nullptr;
+ }
+@@ -8015,6 +7992,10 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
+ return getSMaxExpr(NewOps);
+ if (isa<SCEVUMaxExpr>(Comm))
+ return getUMaxExpr(NewOps);
++ if (isa<SCEVSMinExpr>(Comm))
++ return getSMinExpr(NewOps);
++ if (isa<SCEVUMinExpr>(Comm))
++ return getUMinExpr(NewOps);
+ llvm_unreachable("Unknown commutative SCEV type!");
+ }
+ }
+@@ -10998,7 +10979,9 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
+ case scAddExpr:
+ case scMulExpr:
+ case scUMaxExpr:
+- case scSMaxExpr: {
++ case scSMaxExpr:
++ case scUMinExpr:
++ case scSMinExpr: {
+ bool HasVarying = false;
+ for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) {
+ LoopDisposition D = getLoopDisposition(Op, L);
+@@ -11085,7 +11068,9 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) {
+ case scAddExpr:
+ case scMulExpr:
+ case scUMaxExpr:
+- case scSMaxExpr: {
++ case scSMaxExpr:
++ case scUMinExpr:
++ case scSMinExpr: {
+ const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
+ bool Proper = true;
+ for (const SCEV *NAryOp : NAry->operands()) {
+diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
+index 01a8732b0b8..8160a1eaa0b 100644
+--- a/lib/Analysis/ScalarEvolutionExpander.cpp
++++ b/lib/Analysis/ScalarEvolutionExpander.cpp
+@@ -1634,14 +1634,15 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+- if (S->getOperand(i)->getType() != Ty) {
++ Type *OpTy = S->getOperand(i)->getType();
++ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
+ Value *RHS = expandCodeFor(S->getOperand(i), Ty);
+ Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
+ rememberInstruction(ICmp);
+- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
++ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin");
+ rememberInstruction(Sel);
+ LHS = Sel;
+ }
+@@ -1658,14 +1659,15 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+- if (S->getOperand(i)->getType() != Ty) {
++ Type *OpTy = S->getOperand(i)->getType();
++ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
+ Value *RHS = expandCodeFor(S->getOperand(i), Ty);
+ Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
+ rememberInstruction(ICmp);
+- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
++ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin");
+ rememberInstruction(Sel);
+ LHS = Sel;
+ }
+@@ -1671,6 +1671,56 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ return LHS;
+ }
+
++Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
++ Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
++ Type *Ty = LHS->getType();
++ for (int i = S->getNumOperands()-2; i >= 0; --i) {
++ // In the case of mixed integer and pointer types, do the
++ // rest of the comparisons as integer.
++ Type *OpTy = S->getOperand(i)->getType(); ++ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { ++ Ty = SE.getEffectiveSCEVType(Ty); ++ LHS = InsertNoopCastOfTo(LHS, Ty); ++ } ++ Value *RHS = expandCodeFor(S->getOperand(i), Ty); ++ Value *ICmp = Builder.CreateICmpSLT(LHS, RHS); ++ rememberInstruction(ICmp); ++ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax"); ++ rememberInstruction(Sel); ++ LHS = Sel; ++ } ++ // In the case of mixed integer and pointer types, cast the ++ // final result back to the pointer type. ++ if (LHS->getType() != S->getType()) ++ LHS = InsertNoopCastOfTo(LHS, S->getType()); ++ return LHS; ++} ++ ++Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) { ++ Value *LHS = expand(S->getOperand(S->getNumOperands()-1)); ++ Type *Ty = LHS->getType(); ++ for (int i = S->getNumOperands()-2; i >= 0; --i) { ++ // In the case of mixed integer and pointer types, do the ++ // rest of the comparisons as integer. ++ Type *OpTy = S->getOperand(i)->getType(); ++ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) { ++ Ty = SE.getEffectiveSCEVType(Ty); ++ LHS = InsertNoopCastOfTo(LHS, Ty); ++ } ++ Value *RHS = expandCodeFor(S->getOperand(i), Ty); ++ Value *ICmp = Builder.CreateICmpULT(LHS, RHS); ++ rememberInstruction(ICmp); ++ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax"); ++ rememberInstruction(Sel); ++ LHS = Sel; ++ } ++ // In the case of mixed integer and pointer types, cast the ++ // final result back to the pointer type. ++ if (LHS->getType() != S->getType()) ++ LHS = InsertNoopCastOfTo(LHS, S->getType()); ++ return LHS; ++} ++ + Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, + Instruction *IP) { + setInsertPoint(IP); +diff --git a/test/Analysis/LoopAccessAnalysis/memcheck-ni.ll b/test/Analysis/LoopAccessAnalysis/memcheck-ni.ll +new file mode 100644 +index 00000000000..a08632f38d1 +--- /dev/null ++++ b/test/Analysis/LoopAccessAnalysis/memcheck-ni.ll +@@ -0,0 +1,50 @@ ++; RUN: opt -loop-versioning -S < %s | FileCheck %s ++ ++; NB: addrspaces 10-13 are non-integral ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" ++ ++%jl_value_t = type opaque ++%jl_array_t = type { i8 addrspace(13)*, i64, i16, i16, i32 } ++ ++define void @"japi1_permutedims!_33509"(%jl_value_t addrspace(10)**) { ++; CHECK: [[CMP:%[^ ]*]] = icmp ult double addrspace(13)* [[A:%[^ ]*]], [[B:%[^ ]*]] ++; CHECK: [[SELECT:%[^ ]*]] = select i1 %18, double addrspace(13)* [[A]], double addrspace(13)* [[B]] ++top: ++ %1 = alloca [3 x i64], align 8 ++ %2 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %0, align 8 ++ %3 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %0, i64 1 ++ %4 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %3, align 8 ++ %5 = getelementptr inbounds [3 x i64], [3 x i64]* %1, i64 0, i64 0 ++ store i64 1, i64* %5, align 8 ++ %6 = getelementptr inbounds [3 x i64], [3 x i64]* %1, i64 0, i64 1 ++ %7 = load i64, i64* inttoptr (i64 24 to i64*), align 8 ++ %8 = addrspacecast %jl_value_t addrspace(10)* %4 to %jl_value_t addrspace(11)* ++ %9 = bitcast %jl_value_t addrspace(11)* %8 to double addrspace(13)* addrspace(11)* ++ %10 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %9, align 8 ++ %11 = addrspacecast %jl_value_t addrspace(10)* %2 to %jl_value_t addrspace(11)* ++ %12 = bitcast %jl_value_t addrspace(11)* %11 to double addrspace(13)* addrspace(11)* ++ %13 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %12, align 8 ++ %14 = load i64, i64* %6, align 
8 ++ br label %L74 ++ ++L74: ++ %value_phi20 = phi i64 [ 1, %top ], [ %22, %L74 ] ++ %value_phi21 = phi i64 [ 1, %top ], [ %23, %L74 ] ++ %value_phi22 = phi i64 [ 1, %top ], [ %25, %L74 ] ++ %15 = add i64 %value_phi21, -1 ++ %16 = getelementptr inbounds double, double addrspace(13)* %10, i64 %15 ++ %17 = bitcast double addrspace(13)* %16 to i64 addrspace(13)* ++ %18 = load i64, i64 addrspace(13)* %17, align 8 ++ %19 = add i64 %value_phi20, -1 ++ %20 = getelementptr inbounds double, double addrspace(13)* %13, i64 %19 ++ %21 = bitcast double addrspace(13)* %20 to i64 addrspace(13)* ++ store i64 %18, i64 addrspace(13)* %21, align 8 ++ %22 = add i64 %value_phi20, 1 ++ %23 = add i64 %14, %value_phi21 ++ %24 = icmp eq i64 %value_phi22, %7 ++ %25 = add i64 %value_phi22, 1 ++ br i1 %24, label %L94, label %L74 ++ ++L94: ++ ret void ++} +diff --git a/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll b/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll +index 405a47554e4..4285ef0f117 100644 +--- a/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll ++++ b/test/Analysis/LoopAccessAnalysis/reverse-memcheck-bounds.ll +@@ -58,7 +58,7 @@ for.end: ; preds = %for.body + + ; Here it is not obvious what the limits are, since 'step' could be negative. + +-; CHECK: Low: (-1 + (-1 * ((-60001 + (-1 * %a)) umax (-60001 + (40000 * %step) + (-1 * %a))))) ++; CHECK: Low: ((60000 + %a) umin (60000 + (-40000 * %step) + %a)) + ; CHECK: High: (4 + ((60000 + %a) umax (60000 + (-40000 * %step) + %a))) + + define void @g(i64 %step) { +diff --git a/test/Analysis/ScalarEvolution/2008-07-29-SMinExpr.ll b/test/Analysis/ScalarEvolution/2008-07-29-SMinExpr.ll +index 3542ad2a41e..53e024a68fb 100644 +--- a/test/Analysis/ScalarEvolution/2008-07-29-SMinExpr.ll ++++ b/test/Analysis/ScalarEvolution/2008-07-29-SMinExpr.ll +@@ -22,5 +22,5 @@ afterfor: ; preds = %forinc, %entry + ret i32 %j.0.lcssa + } + +-; CHECK: backedge-taken count is (-2147483632 + ((-1 + (-1 * %{{[xy]}})) smax (-1 + (-1 * %{{[xy]}})))) ++; CHECK: backedge-taken count is (-2147483633 + (-1 * (%x smin %y))) + +diff --git a/test/Analysis/ScalarEvolution/min-max-exprs.ll b/test/Analysis/ScalarEvolution/min-max-exprs.ll +index e8c1e33e095..51f72c643cc 100644 +--- a/test/Analysis/ScalarEvolution/min-max-exprs.ll ++++ b/test/Analysis/ScalarEvolution/min-max-exprs.ll +@@ -33,7 +33,7 @@ bb2: ; preds = %bb1 + %tmp9 = select i1 %tmp4, i64 %tmp5, i64 %tmp6 + ; min(N, i+3) + ; CHECK: select i1 %tmp4, i64 %tmp5, i64 %tmp6 +-; CHECK-NEXT: --> (-1 + (-1 * ((-1 + (-1 * (sext i32 {3,+,1}<%bb1> to i64))) smax (-1 + (-1 * (sext i32 %N to i64)))))) ++; CHECK-NEXT: --> ((sext i32 {3,+,1}<%bb1> to i64) smin (sext i32 %N to i64)) + %tmp11 = getelementptr inbounds i32, i32* %A, i64 %tmp9 + %tmp12 = load i32, i32* %tmp11, align 4 + %tmp13 = shl nsw i32 %tmp12, 1 +diff --git a/test/Analysis/ScalarEvolution/pr28705.ll b/test/Analysis/ScalarEvolution/pr28705.ll +index 8fbc08e3ca6..7d797a15bd5 100644 +--- a/test/Analysis/ScalarEvolution/pr28705.ll ++++ b/test/Analysis/ScalarEvolution/pr28705.ll +@@ -5,7 +5,7 @@ + ; with "%.sroa.speculated + 1". 
+ ; + ; CHECK-LABEL: @foo( +-; CHECK: %[[EXIT:.+]] = sub i32 %.sroa.speculated, -1 ++; CHECK: %[[EXIT:.+]] = add i32 %.sroa.speculated, 1 + ; CHECK: %DB.sroa.9.0.lcssa = phi i32 [ 1, %entry ], [ %[[EXIT]], %loopexit ] + ; + define void @foo(i32 %sub.ptr.div.i, i8* %ref.i1174) local_unnamed_addr { +diff --git a/test/Analysis/ScalarEvolution/predicated-trip-count.ll b/test/Analysis/ScalarEvolution/predicated-trip-count.ll +index 2db0a8b5777..b07662ed95f 100644 +--- a/test/Analysis/ScalarEvolution/predicated-trip-count.ll ++++ b/test/Analysis/ScalarEvolution/predicated-trip-count.ll +@@ -80,7 +80,7 @@ return: ; preds = %bb5 + ; CHECK-NEXT: --> (sext i16 {%Start,+,-1}<%bb3> to i32) + ; CHECK: Loop %bb3: Unpredictable backedge-taken count. + ; CHECK-NEXT: Loop %bb3: Unpredictable max backedge-taken count. +-; CHECK-NEXT: Loop %bb3: Predicated backedge-taken count is (2 + (sext i16 %Start to i32) + ((-2 + (-1 * (sext i16 %Start to i32))) smax (-1 + (-1 * %M)))) ++; CHECK-NEXT: Loop %bb3: Predicated backedge-taken count is (1 + (sext i16 %Start to i32) + (-1 * ((1 + (sext i16 %Start to i32)) smin %M))) + ; CHECK-NEXT: Predicates: + ; CHECK-NEXT: {%Start,+,-1}<%bb3> Added Flags: + +diff --git a/test/Analysis/ScalarEvolution/trip-count3.ll b/test/Analysis/ScalarEvolution/trip-count3.ll +index cce0182d649..7f20b4e71be 100644 +--- a/test/Analysis/ScalarEvolution/trip-count3.ll ++++ b/test/Analysis/ScalarEvolution/trip-count3.ll +@@ -4,7 +4,7 @@ + ; dividing by the stride will have a remainder. This could theoretically + ; be teaching it how to use a more elaborate trip count computation. + +-; CHECK: Loop %bb3.i: backedge-taken count is ((64 + (-64 smax (-1 + (-1 * %0))) + %0) /u 64) ++; CHECK: Loop %bb3.i: backedge-taken count is ((63 + (-1 * (63 smin %0)) + %0) /u 64) + ; CHECK: Loop %bb3.i: max backedge-taken count is 33554431 + + %struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +diff --git a/test/Transforms/IRCE/conjunctive-checks.ll b/test/Transforms/IRCE/conjunctive-checks.ll +index f6a909e432c..d9bf485df3a 100644 +--- a/test/Transforms/IRCE/conjunctive-checks.ll ++++ b/test/Transforms/IRCE/conjunctive-checks.ll +@@ -4,16 +4,6 @@ define void @f_0(i32 *%arr, i32 *%a_len_ptr, i32 %n, i1* %cond_buf) { + ; CHECK-LABEL: @f_0( + + ; CHECK: loop.preheader: +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_safe_range_end:[^ ]+]] = sub i32 3, %len +-; CHECK: [[not_exit_main_loop_at_hiclamp_cmp:[^ ]+]] = icmp sgt i32 [[not_n]], [[not_safe_range_end]] +-; CHECK: [[not_exit_main_loop_at_hiclamp:[^ ]+]] = select i1 [[not_exit_main_loop_at_hiclamp_cmp]], i32 [[not_n]], i32 [[not_safe_range_end]] +-; CHECK: [[exit_main_loop_at_hiclamp:[^ ]+]] = sub i32 -1, [[not_exit_main_loop_at_hiclamp]] +-; CHECK: [[exit_main_loop_at_loclamp_cmp:[^ ]+]] = icmp sgt i32 [[exit_main_loop_at_hiclamp]], 0 +-; CHECK: [[exit_main_loop_at_loclamp:[^ ]+]] = select i1 [[exit_main_loop_at_loclamp_cmp]], i32 [[exit_main_loop_at_hiclamp]], i32 0 +-; CHECK: [[enter_main_loop:[^ ]+]] = icmp slt i32 0, [[exit_main_loop_at_loclamp]] +-; CHECK: br i1 [[enter_main_loop]], label %loop.preheader2, label %main.pseudo.exit +- + ; CHECK: loop.preheader2: + ; CHECK: br label %loop + +@@ -57,14 +47,10 @@ define void @f_1( + ; CHECK-LABEL: @f_1( + + ; CHECK: loop.preheader: +-; CHECK: [[not_len_b:[^ ]+]] = sub i32 -1, %len.b +-; CHECK: [[not_len_a:[^ ]+]] = sub i32 -1, %len.a +-; CHECK: 
[[smax_not_len_cond:[^ ]+]] = icmp sgt i32 [[not_len_b]], [[not_len_a]] +-; CHECK: [[smax_not_len:[^ ]+]] = select i1 [[smax_not_len_cond]], i32 [[not_len_b]], i32 [[not_len_a]] +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_upper_limit_cond_loclamp:[^ ]+]] = icmp sgt i32 [[smax_not_len]], [[not_n]] +-; CHECK: [[not_upper_limit_loclamp:[^ ]+]] = select i1 [[not_upper_limit_cond_loclamp]], i32 [[smax_not_len]], i32 [[not_n]] +-; CHECK: [[upper_limit_loclamp:[^ ]+]] = sub i32 -1, [[not_upper_limit_loclamp]] ++; CHECK: [[smax_len_cond:[^ ]+]] = icmp slt i32 %len.b, %len.a ++; CHECK: [[smax_len:[^ ]+]] = select i1 [[smax_len_cond]], i32 %len.b, i32 %len.a ++; CHECK: [[upper_limit_cond_loclamp:[^ ]+]] = icmp slt i32 [[smax_len]], %n ++; CHECK: [[upper_limit_loclamp:[^ ]+]] = select i1 [[upper_limit_cond_loclamp]], i32 [[smax_len]], i32 %n + ; CHECK: [[upper_limit_cmp:[^ ]+]] = icmp sgt i32 [[upper_limit_loclamp]], 0 + ; CHECK: [[upper_limit:[^ ]+]] = select i1 [[upper_limit_cmp]], i32 [[upper_limit_loclamp]], i32 0 + +diff --git a/test/Transforms/IRCE/decrementing-loop.ll b/test/Transforms/IRCE/decrementing-loop.ll +index fac873b4a24..30663da9e9f 100644 +--- a/test/Transforms/IRCE/decrementing-loop.ll ++++ b/test/Transforms/IRCE/decrementing-loop.ll +@@ -28,11 +28,8 @@ define void @decrementing_loop(i32 *%arr, i32 *%a_len_ptr, i32 %n) { + ret void + + ; CHECK: loop.preheader: +-; CHECK: [[not_len:[^ ]+]] = sub i32 -1, %len +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_len_hiclamp_cmp:[^ ]+]] = icmp sgt i32 [[not_len]], [[not_n]] +-; CHECK: [[not_len_hiclamp:[^ ]+]] = select i1 [[not_len_hiclamp_cmp]], i32 [[not_len]], i32 [[not_n]] +-; CHECK: [[len_hiclamp:[^ ]+]] = sub i32 -1, [[not_len_hiclamp]] ++; CHECK: [[len_hiclamp_cmp:[^ ]+]] = icmp slt i32 %len, %n ++; CHECK: [[len_hiclamp:[^ ]+]] = select i1 [[len_hiclamp_cmp]], i32 %len, i32 %n + ; CHECK: [[not_exit_preloop_at_cmp:[^ ]+]] = icmp sgt i32 [[len_hiclamp]], 0 + ; CHECK: [[not_exit_preloop_at:[^ ]+]] = select i1 [[not_exit_preloop_at_cmp]], i32 [[len_hiclamp]], i32 0 + ; CHECK: %exit.preloop.at = add i32 [[not_exit_preloop_at]], -1 +diff --git a/test/Transforms/IRCE/multiple-access-no-preloop.ll b/test/Transforms/IRCE/multiple-access-no-preloop.ll +index 31bfe7881b6..e693b1b8ef4 100644 +--- a/test/Transforms/IRCE/multiple-access-no-preloop.ll ++++ b/test/Transforms/IRCE/multiple-access-no-preloop.ll +@@ -37,14 +37,10 @@ define void @multiple_access_no_preloop( + ; CHECK-LABEL: @multiple_access_no_preloop( + + ; CHECK: loop.preheader: +-; CHECK: [[not_len_b:[^ ]+]] = sub i32 -1, %len.b +-; CHECK: [[not_len_a:[^ ]+]] = sub i32 -1, %len.a +-; CHECK: [[smax_not_len_cond:[^ ]+]] = icmp sgt i32 [[not_len_b]], [[not_len_a]] +-; CHECK: [[smax_not_len:[^ ]+]] = select i1 [[smax_not_len_cond]], i32 [[not_len_b]], i32 [[not_len_a]] +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_upper_limit_cond_loclamp:[^ ]+]] = icmp sgt i32 [[smax_not_len]], [[not_n]] +-; CHECK: [[not_upper_limit_loclamp:[^ ]+]] = select i1 [[not_upper_limit_cond_loclamp]], i32 [[smax_not_len]], i32 [[not_n]] +-; CHECK: [[upper_limit_loclamp:[^ ]+]] = sub i32 -1, [[not_upper_limit_loclamp]] ++; CHECK: [[smax_len_cond:[^ ]+]] = icmp slt i32 %len.b, %len.a ++; CHECK: [[smax_len:[^ ]+]] = select i1 [[smax_len_cond]], i32 %len.b, i32 %len.a ++; CHECK: [[upper_limit_cond_loclamp:[^ ]+]] = icmp slt i32 [[smax_len]], %n ++; CHECK: [[upper_limit_loclamp:[^ ]+]] = select i1 [[upper_limit_cond_loclamp]], i32 [[smax_len]], i32 %n + ; CHECK: 
[[upper_limit_cmp:[^ ]+]] = icmp sgt i32 [[upper_limit_loclamp]], 0 + ; CHECK: [[upper_limit:[^ ]+]] = select i1 [[upper_limit_cmp]], i32 [[upper_limit_loclamp]], i32 0 + +diff --git a/test/Transforms/IRCE/ranges_of_different_types.ll b/test/Transforms/IRCE/ranges_of_different_types.ll +index c38ef24bc18..5694906a4c5 100644 +--- a/test/Transforms/IRCE/ranges_of_different_types.ll ++++ b/test/Transforms/IRCE/ranges_of_different_types.ll +@@ -22,12 +22,11 @@ define void @test_01(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-NOT: preloop + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 12, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[SMAX:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, [[SMAX]] +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SUB2]], 0 +-; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SUB2]], i32 0 ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = add i32 %len, -13 ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp slt i32 [[SUB1]], 101 ++; CHECK-NEXT: [[SMAX:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 101 ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SMAX]], 0 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SMAX]], i32 0 + ; CHECK-NEXT: [[GOTO_LOOP:%[^ ]+]] = icmp slt i32 0, %exit.mainloop.at + ; CHECK-NEXT: br i1 [[GOTO_LOOP]], label %loop.preheader, label %main.pseudo.exit + ; CHECK: loop +@@ -82,13 +81,11 @@ define void @test_02(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-NEXT: [[LEN_MINUS_SMAX:%[^ ]+]] = add i32 %len, -2147483647 + ; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[LEN_MINUS_SMAX]], -13 + ; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[LEN_MINUS_SMAX]], i32 -13 +-; CHECK-NEXT: [[ADD1:%[^ ]+]] = add i32 [[SMAX1]], -1 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 [[ADD1]], %len +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[SMAX2:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, [[SMAX2]] +-; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp sgt i32 [[SUB2]], 0 +-; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP3]], i32 [[SUB2]], i32 0 ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 %len, [[SMAX1]] ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp slt i32 [[SUB1]], 101 ++; CHECK-NEXT: [[SMAX2:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB1]], i32 101 ++; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp sgt i32 [[SMAX2]], 0 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP3]], i32 [[SMAX2]], i32 0 + ; CHECK-NEXT: br i1 true, label %loop.preloop.preheader + ; CHECK: loop.preloop: + ; CHECK-NEXT: %idx.preloop = phi i32 [ %idx.next.preloop, %in.bounds.preloop ], [ 0, %loop.preloop.preheader ] +@@ -150,14 +147,11 @@ define void @test_03(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-NOT: preloop + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 -2, %len +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[SUB2]], -14 +-; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB2]], i32 -14 +-; CHECK-NEXT: [[SUB3:%[^ ]+]] = sub i32 [[SUB1]], [[SMAX1]] +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp ugt i32 [[SUB3]], -102 +-; CHECK-NEXT: [[UMAX1:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB3]], i32 -102 +-; CHECK-NEXT: %exit.mainloop.at = sub i32 -1, [[UMAX1]] ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp slt i32 %len, 13 ++; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 
[[CMP1]], i32 %len, i32 13 ++; CHECK-NEXT: [[SUB3:%[^ ]+]] = sub i32 %len, [[SMAX1]] ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp ult i32 [[SUB3]], 101 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SUB3]], i32 101 + ; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp ult i32 0, %exit.mainloop.at + ; CHECK-NEXT: br i1 [[CMP3]], label %loop.preheader, label %main.pseudo.exit + ; CHECK: postloop: +@@ -207,10 +201,9 @@ define void @test_04(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-LABEL: test_04( + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 -14, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp ugt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[UMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: %exit.mainloop.at = sub i32 -1, [[UMAX1]] ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = add i32 %len, 13 ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp ult i32 [[SUB1]], 101 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP1]], i32 [[SUB1]], i32 101 + ; CHECK-NEXT: br i1 true, label %loop.preloop.preheader + ; CHECK: in.bounds.preloop: + ; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +@@ -251,12 +244,11 @@ define void @test_05(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-NOT: preloop + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 12, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[SMAX:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, [[SMAX]] +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SUB2]], 0 +-; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SUB2]], i32 0 ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = add i32 %len, -13 ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp slt i32 [[SUB1]], 101 ++; CHECK-NEXT: [[SMAX:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 101 ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SMAX]], 0 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SMAX]], i32 0 + ; CHECK-NEXT: [[GOTO_LOOP:%[^ ]+]] = icmp slt i32 0, %exit.mainloop.at + ; CHECK-NEXT: br i1 [[GOTO_LOOP]], label %loop.preheader, label %main.pseudo.exit + ; CHECK: loop +@@ -296,13 +288,11 @@ define void @test_06(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-NEXT: [[LEN_MINUS_SMAX:%[^ ]+]] = add i32 %len, -2147483647 + ; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[LEN_MINUS_SMAX]], -13 + ; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[LEN_MINUS_SMAX]], i32 -13 +-; CHECK-NEXT: [[ADD1:%[^ ]+]] = add i32 [[SMAX1]], -1 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 [[ADD1]], %len +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp sgt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[SMAX2:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, [[SMAX2]] +-; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp sgt i32 [[SUB2]], 0 +-; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP3]], i32 [[SUB2]], i32 0 ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 %len, [[SMAX1]] ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp slt i32 [[SUB1]], 101 ++; CHECK-NEXT: [[SMAX2:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB1]], i32 101 ++; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp sgt i32 [[SMAX2]], 0 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP3]], i32 [[SMAX2]], i32 0 + ; CHECK-NEXT: br i1 true, label %loop.preloop.preheader + ; CHECK: in.bounds.preloop: + ; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +@@ -343,14 +333,11 @@ define void @test_07(i32* %arr, i32* %a_len_ptr) 
#0 { + ; CHECK-NOT: preloop + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 -2, %len +-; CHECK-NEXT: [[SUB2:%[^ ]+]] = sub i32 -1, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp sgt i32 [[SUB2]], -14 +-; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB2]], i32 -14 +-; CHECK-NEXT: [[SUB3:%[^ ]+]] = sub i32 [[SUB1]], [[SMAX1]] +-; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp ugt i32 [[SUB3]], -102 +-; CHECK-NEXT: [[UMAX1:%[^ ]+]] = select i1 [[CMP2]], i32 [[SUB3]], i32 -102 +-; CHECK-NEXT: %exit.mainloop.at = sub i32 -1, [[UMAX1]] ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp slt i32 %len, 13 ++; CHECK-NEXT: [[SMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 %len, i32 13 ++; CHECK-NEXT: [[SUB3:%[^ ]+]] = sub i32 %len, [[SMAX1]] ++; CHECK-NEXT: [[CMP2:%[^ ]+]] = icmp ult i32 [[SUB3]], 101 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP2]], i32 [[SUB3]], i32 101 + ; CHECK-NEXT: [[CMP3:%[^ ]+]] = icmp ult i32 0, %exit.mainloop.at + ; CHECK-NEXT: br i1 [[CMP3]], label %loop.preheader, label %main.pseudo.exit + ; CHECK: loop +@@ -387,10 +374,9 @@ define void @test_08(i32* %arr, i32* %a_len_ptr) #0 { + ; CHECK-LABEL: test_08( + ; CHECK: entry: + ; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0 +-; CHECK-NEXT: [[SUB1:%[^ ]+]] = sub i32 -14, %len +-; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp ugt i32 [[SUB1]], -102 +-; CHECK-NEXT: [[UMAX1:%[^ ]+]] = select i1 [[CMP1]], i32 [[SUB1]], i32 -102 +-; CHECK-NEXT: %exit.mainloop.at = sub i32 -1, [[UMAX1]] ++; CHECK-NEXT: [[SUB1:%[^ ]+]] = add i32 %len, 13 ++; CHECK-NEXT: [[CMP1:%[^ ]+]] = icmp ult i32 [[SUB1]], 101 ++; CHECK-NEXT: %exit.mainloop.at = select i1 [[CMP1]], i32 [[SUB1]], i32 101 + ; CHECK-NEXT: br i1 true, label %loop.preloop.preheader + ; CHECK: in.bounds.preloop: + ; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop +diff --git a/test/Transforms/IRCE/single-access-no-preloop.ll b/test/Transforms/IRCE/single-access-no-preloop.ll +index 53f430d0ba3..cbbdf81d46c 100644 +--- a/test/Transforms/IRCE/single-access-no-preloop.ll ++++ b/test/Transforms/IRCE/single-access-no-preloop.ll +@@ -85,11 +85,9 @@ define void @single_access_no_preloop_with_offset(i32 *%arr, i32 *%a_len_ptr, i3 + ; CHECK-LABEL: @single_access_no_preloop_with_offset( + + ; CHECK: loop.preheader: +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_safe_range_end:[^ ]+]] = sub i32 3, %len +-; CHECK: [[not_exit_main_loop_at_hiclamp_cmp:[^ ]+]] = icmp sgt i32 [[not_n]], [[not_safe_range_end]] +-; CHECK: [[not_exit_main_loop_at_hiclamp:[^ ]+]] = select i1 [[not_exit_main_loop_at_hiclamp_cmp]], i32 [[not_n]], i32 [[not_safe_range_end]] +-; CHECK: [[exit_main_loop_at_hiclamp:[^ ]+]] = sub i32 -1, [[not_exit_main_loop_at_hiclamp]] ++; CHECK: [[safe_range_end:[^ ]+]] = add i32 %len, -4 ++; CHECK: [[exit_main_loop_at_hiclamp_cmp:[^ ]+]] = icmp slt i32 %n, [[safe_range_end]] ++; CHECK: [[exit_main_loop_at_hiclamp:[^ ]+]] = select i1 [[exit_main_loop_at_hiclamp_cmp]], i32 %n, i32 [[safe_range_end]] + ; CHECK: [[exit_main_loop_at_loclamp_cmp:[^ ]+]] = icmp sgt i32 [[exit_main_loop_at_hiclamp]], 0 + ; CHECK: [[exit_main_loop_at_loclamp:[^ ]+]] = select i1 [[exit_main_loop_at_loclamp_cmp]], i32 [[exit_main_loop_at_hiclamp]], i32 0 + ; CHECK: [[enter_main_loop:[^ ]+]] = icmp slt i32 0, [[exit_main_loop_at_loclamp]] +diff --git a/test/Transforms/IRCE/single-access-with-preloop.ll b/test/Transforms/IRCE/single-access-with-preloop.ll +index 4b93122b6e7..3e2395dd100 100644 +--- 
a/test/Transforms/IRCE/single-access-with-preloop.ll ++++ b/test/Transforms/IRCE/single-access-with-preloop.ll +@@ -33,11 +33,9 @@ define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 + ; CHECK: [[check_min_sint_offset:[^ ]+]] = icmp sgt i32 %offset, -2147483647 + ; CHECK: [[safe_offset_preloop:[^ ]+]] = select i1 [[check_min_sint_offset]], i32 %offset, i32 -2147483647 + ; If Offset was a SINT_MIN, we could have an overflow here. That is why we calculated its safe version. +-; CHECK: [[not_safe_start:[^ ]+]] = add i32 [[safe_offset_preloop]], -1 +-; CHECK: [[not_n:[^ ]+]] = sub i32 -1, %n +-; CHECK: [[not_exit_preloop_at_cond_loclamp:[^ ]+]] = icmp sgt i32 [[not_safe_start]], [[not_n]] +-; CHECK: [[not_exit_preloop_at_loclamp:[^ ]+]] = select i1 [[not_exit_preloop_at_cond_loclamp]], i32 [[not_safe_start]], i32 [[not_n]] +-; CHECK: [[exit_preloop_at_loclamp:[^ ]+]] = sub i32 -1, [[not_exit_preloop_at_loclamp]] ++; CHECK: [[safe_start:[^ ]+]] = sub i32 0, [[safe_offset_preloop]] ++; CHECK: [[exit_preloop_at_cond_loclamp:[^ ]+]] = icmp slt i32 %n, [[safe_start]] ++; CHECK: [[exit_preloop_at_loclamp:[^ ]+]] = select i1 [[exit_preloop_at_cond_loclamp]], i32 %n, i32 [[safe_start]] + ; CHECK: [[exit_preloop_at_cond:[^ ]+]] = icmp sgt i32 [[exit_preloop_at_loclamp]], 0 + ; CHECK: [[exit_preloop_at:[^ ]+]] = select i1 [[exit_preloop_at_cond]], i32 [[exit_preloop_at_loclamp]], i32 0 + +@@ -45,17 +43,15 @@ define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 + ; CHECK: [[len_minus_sint_max:[^ ]+]] = add i32 %len, -2147483647 + ; CHECK: [[check_len_min_sint_offset:[^ ]+]] = icmp sgt i32 %offset, [[len_minus_sint_max]] + ; CHECK: [[safe_offset_mainloop:[^ ]+]] = select i1 [[check_len_min_sint_offset]], i32 %offset, i32 [[len_minus_sint_max]] +-; CHECK: [[not_safe_start_2:[^ ]+]] = add i32 [[safe_offset_mainloop]], -1 + ; If Offset was a SINT_MIN, we could have an overflow here. That is why we calculated its safe version. 
+-; CHECK: [[not_safe_upper_end:[^ ]+]] = sub i32 [[not_safe_start_2]], %len +-; CHECK: [[not_exit_mainloop_at_cond_loclamp:[^ ]+]] = icmp sgt i32 [[not_safe_upper_end]], [[not_n]] +-; CHECK: [[not_exit_mainloop_at_loclamp:[^ ]+]] = select i1 [[not_exit_mainloop_at_cond_loclamp]], i32 [[not_safe_upper_end]], i32 [[not_n]] ++; CHECK: [[safe_upper_end:[^ ]+]] = sub i32 %len, [[safe_offset_mainloop]] ++; CHECK: [[exit_mainloop_at_cond_loclamp:[^ ]+]] = icmp slt i32 %n, [[safe_upper_end]] ++; CHECK: [[exit_mainloop_at_loclamp:[^ ]+]] = select i1 [[exit_mainloop_at_cond_loclamp]], i32 %n, i32 [[safe_upper_end]] + ; CHECK: [[check_offset_mainloop_2:[^ ]+]] = icmp sgt i32 %offset, 0 + ; CHECK: [[safe_offset_mainloop_2:[^ ]+]] = select i1 [[check_offset_mainloop_2]], i32 %offset, i32 0 +-; CHECK: [[not_safe_lower_end:[^ ]+]] = add i32 [[safe_offset_mainloop_2]], -2147483648 +-; CHECK: [[not_exit_mainloop_at_cond_hiclamp:[^ ]+]] = icmp sgt i32 [[not_exit_mainloop_at_loclamp]], [[not_safe_lower_end]] +-; CHECK: [[not_exit_mainloop_at_hiclamp:[^ ]+]] = select i1 [[not_exit_mainloop_at_cond_hiclamp]], i32 [[not_exit_mainloop_at_loclamp]], i32 [[not_safe_lower_end]] +-; CHECK: [[exit_mainloop_at_hiclamp:[^ ]+]] = sub i32 -1, [[not_exit_mainloop_at_hiclamp]] ++; CHECK: [[safe_lower_end:[^ ]+]] = sub i32 2147483647, [[safe_offset_mainloop_2]] ++; CHECK: [[exit_mainloop_at_cond_hiclamp:[^ ]+]] = icmp slt i32 [[exit_mainloop_at_loclamp]], [[safe_lower_end]] ++; CHECK: [[exit_mainloop_at_hiclamp:[^ ]+]] = select i1 [[exit_mainloop_at_cond_hiclamp]], i32 [[exit_mainloop_at_loclamp]], i32 [[safe_lower_end]] + ; CHECK: [[exit_mainloop_at_cmp:[^ ]+]] = icmp sgt i32 [[exit_mainloop_at_hiclamp]], 0 + ; CHECK: [[exit_mainloop_at:[^ ]+]] = select i1 [[exit_mainloop_at_cmp]], i32 [[exit_mainloop_at_hiclamp]], i32 0 + +diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll +index ea3f6077231..d5232e1874c 100644 +--- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll ++++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll +@@ -14,8 +14,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 + ; current LSR cost model. 
+ ; CHECK-NOT: = ptrtoint i8* undef to i64 + ; CHECK: .lr.ph +-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1 +-; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}} + ; CHECK: ret void + define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 { + bb: diff --git a/patches/llvm-OProfile-line-num.patch b/patches/llvm-OProfile-line-num.patch new file mode 100644 index 0000000..03b2ca8 --- /dev/null +++ b/patches/llvm-OProfile-line-num.patch @@ -0,0 +1,48 @@ +commit 4840cf7299bb312125d41fc84733c15c2370f18e +Author: DokFaust +Date: Fri Jun 8 19:23:01 2018 +0200 + + Add debug line-level code information to OProfile module + +diff --git a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt +index 7d5550046a5..ea100286318 100644 +--- a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt ++++ b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt +@@ -24 +24 @@ parent = ExecutionEngine +-required_libraries = Support Object ExecutionEngine ++required_libraries = DebugInfoDWARF Support Object ExecutionEngine +diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +index 3581d645839..045ecb82853 100644 +--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp ++++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +@@ -26,0 +27,2 @@ ++#include "llvm/DebugInfo/DIContext.h" ++#include "llvm/DebugInfo/DWARF/DWARFContext.h" +@@ -86,0 +89,2 @@ void OProfileJITEventListener::NotifyObjectEmitted( ++ std::unique_ptr Context = DWARFContext::create(DebugObj); ++ std::string SourceFileName; +@@ -111 +115,23 @@ void OProfileJITEventListener::NotifyObjectEmitted( +- // TODO: support line number info (similar to IntelJITEventListener.cpp) ++ DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size); ++ DILineInfoTable::iterator Begin = Lines.begin(); ++ DILineInfoTable::iterator End = Lines.end(); ++ size_t i = 0; ++ ++ size_t num_entries = std::distance(Begin, End); ++ static struct debug_line_info* debug_line; ++ debug_line = (struct debug_line_info * )calloc(num_entries, sizeof(struct debug_line_info)); ++ ++ for(DILineInfoTable::iterator It=Begin; It != End; ++It){ ++ i = std::distance(Begin,It); ++ debug_line[i].vma = (unsigned long) It->first; ++ debug_line[i].lineno = It->second.Line; ++ SourceFileName = Lines.front().second.FileName; ++ debug_line[i].filename = const_cast(SourceFileName.c_str()); ++ } ++ ++ if(Wrapper->op_write_debug_line_info((void*) Addr, num_entries, debug_line) == -1) { ++ DEBUG(dbgs() << "Failed to tell OProfiler about debug object at [" ++ << (void*) Addr << "-" << ((char *) Addr + Size) ++ << "]\n"); ++ continue; ++ } diff --git a/patches/llvm-PPC-addrspaces.patch b/patches/llvm-PPC-addrspaces.patch new file mode 100644 index 0000000..7f51b3b --- /dev/null +++ b/patches/llvm-PPC-addrspaces.patch @@ -0,0 +1,29 @@ +From 15899eaab58e96bb7bbe7a14099674e255656a50 Mon Sep 17 00:00:00 2001 +From: Valentin Churavy +Date: Fri, 23 Feb 2018 14:41:20 -0500 +Subject: [PATCH] Make AddrSpaceCast noops on PPC + +PPC as AArch64 doesn't have address-spaces so we can drop them in the backend +--- + lib/Target/PowerPC/PPCISelLowering.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h +index e60504507d3..c9b89773968 100644 +--- a/lib/Target/PowerPC/PPCISelLowering.h ++++ b/lib/Target/PowerPC/PPCISelLowering.h +@@ -761,6 +761,11 @@ namespace llvm { + 
ReuseLoadInfo() : IsInvariant(false), Alignment(0), Ranges(nullptr) {} + }; + ++ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { ++ // Addrspacecasts are always noops. ++ return true; ++ } ++ + bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + ISD::LoadExtType ET = ISD::NON_EXTLOAD) const; +-- +2.16.2 + diff --git a/patches/llvm-rL323946-LSRTy.patch b/patches/llvm-rL323946-LSRTy.patch new file mode 100644 index 0000000..ae1a7ac --- /dev/null +++ b/patches/llvm-rL323946-LSRTy.patch @@ -0,0 +1,45 @@ +commit ab60b05a472e8651cbe53c19513b7e62b9ff32df +Author: Mikael Holmen +Date: Thu Feb 1 06:38:34 2018 +0000 + + [LSR] Don't force bases of foldable formulae to the final type. + + Summary: + Before emitting code for scaled registers, we prevent + SCEVExpander from hoisting any scaled addressing mode + by emitting all the bases first. However, these bases + are being forced to the final type, resulting in some + odd code. + + For example, if the type of the base is an integer and + the final type is a pointer, we will emit an inttoptr + for the base, a ptrtoint for the scale, and then a + 'reverse' GEP where the GEP pointer is actually the base + integer and the index is the pointer. It's more intuitive + to use the pointer as a pointer and the integer as index. + + Patch by: Bevin Hansson + + Reviewers: atrick, qcolombet, sanjoy + + Reviewed By: qcolombet + + Subscribers: llvm-commits + + Differential Revision: https://reviews.llvm.org/D42103 + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323946 91177308-0d34-0410-b5e6-96231b3b80d8 + +diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp +index 332c074a1df..4b8e2286ed9 100644 +--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp ++++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp +@@ -4993,7 +4993,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, + // Unless the addressing mode will not be folded. + if (!Ops.empty() && LU.Kind == LSRUse::Address && + isAMCompletelyFolded(TTI, LU, F)) { +- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty); ++ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } diff --git a/patches/llvm-rL326967-aligned-load.patch b/patches/llvm-rL326967-aligned-load.patch new file mode 100644 index 0000000..62c1123 --- /dev/null +++ b/patches/llvm-rL326967-aligned-load.patch @@ -0,0 +1,301 @@ +commit b398d8e1fa5a5a914957fa22d0a64db97f6c265e +Author: Craig Topper +Date: Thu Mar 8 00:21:17 2018 +0000 + + [X86] Fix some isel patterns that used aligned vector load instructions with unaligned predicates. + + These patterns weren't checking the alignment of the load, but were using the aligned instructions. This will cause a GP fault if the data isn't aligned. + + I believe these were introduced in r312450. + + git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326967 91177308-0d34-0410-b5e6-96231b3b80d8 + +diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td +index db3dfe56531..50c7763a2c3 100644 +--- a/lib/Target/X86/X86InstrVecCompiler.td ++++ b/lib/Target/X86/X86InstrVecCompiler.td +@@ -261,10 +261,10 @@ let Predicates = [HasVLX] in { + // will zero the upper bits. + // TODO: Is there a safe way to detect whether the producing instruction + // already zeroed the upper bits? 
+-multiclass subvector_zero_lowering { ++multiclass subvector_zero_lowering { + def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + (SrcTy RC:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), +@@ -274,91 +274,91 @@ multiclass subvector_zero_lowering("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>; ++ (!cast("VMOV"#LoadStr#"rm") addr:$src), SubIdx)>; + } + + let Predicates = [HasAVX, NoVLX] in { +- defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64, +- sub_xmm>; +- defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64, +- sub_xmm>; +-} +- +-let Predicates = [HasVLX] in { +- defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, ++ defm : subvector_zero_lowering<"APD", "UPD", VR128, v4f64, v2f64, v8i32, + loadv2f64, sub_xmm>; +- defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, ++ defm : subvector_zero_lowering<"APS", "UPS", VR128, v8f32, v4f32, v8i32, + loadv4f32, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v4i64, v2i64, v8i32, + loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i32, v4i32, v8i32, + loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i16, v8i16, v8i32, + loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, +- loadv2i64, sub_xmm>; +- +- defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, +- loadv2f64, sub_xmm>; +- defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, +- loadv4f32, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, +- loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, +- loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, +- loadv2i64, sub_xmm>; +- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i8, v16i8, v8i32, + loadv2i64, sub_xmm>; ++} + +- defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, +- loadv4f64, sub_ymm>; +- defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, +- loadv8f32, sub_ymm>; +- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, +- loadv4i64, sub_ymm>; ++let Predicates = [HasVLX] in { ++ defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v4f64, ++ v2f64, v8i32, loadv2f64, sub_xmm>; ++ defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v8f32, ++ v4f32, v8i32, loadv4f32, sub_xmm>; ++ defm : 
subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v4i64, ++ v2i64, v8i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i32, ++ v4i32, v8i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i16, ++ v8i16, v8i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i8, ++ v16i8, v8i32, loadv2i64, sub_xmm>; ++ ++ defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v8f64, ++ v2f64, v16i32, loadv2f64, sub_xmm>; ++ defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v16f32, ++ v4f32, v16i32, loadv4f32, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i64, ++ v2i64, v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i32, ++ v4i32, v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i16, ++ v8i16, v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v64i8, ++ v16i8, v16i32, loadv2i64, sub_xmm>; ++ ++ defm : subvector_zero_lowering<"APDZ256", "UPDZ256", VR256X, v8f64, ++ v4f64, v16i32, loadv4f64, sub_ymm>; ++ defm : subvector_zero_lowering<"APSZ256", "UPDZ256", VR256X, v16f32, ++ v8f32, v16i32, loadv8f32, sub_ymm>; ++ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v8i64, ++ v4i64, v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v16i32, ++ v8i32, v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v32i16, ++ v16i16, v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v64i8, ++ v32i8, v16i32, loadv4i64, sub_ymm>; + } + + let Predicates = [HasAVX512, NoVLX] in { +- defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64, +- sub_xmm>; +- defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64, +- sub_xmm>; +- defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64, +- sub_xmm>; +- +- defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, +- loadv4f64, sub_ymm>; +- defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, +- loadv8f32, sub_ymm>; +- defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, +- loadv4i64, sub_ymm>; +- defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, +- loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"APD", "UPD", VR128, v8f64, v2f64, ++ v16i32,loadv2f64, sub_xmm>; ++ defm : subvector_zero_lowering<"APS", "UPS", VR128, v16f32, v4f32, ++ v16i32, loadv4f32, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i64, v2i64, ++ v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i32, v4i32, ++ v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i16, v8i16, ++ v16i32, loadv2i64, sub_xmm>; ++ defm : subvector_zero_lowering<"DQA", 
"DQU", VR128, v64i8, v16i8, ++ v16i32, loadv2i64, sub_xmm>; ++ ++ defm : subvector_zero_lowering<"APDY", "UPDY", VR256, v8f64, v4f64, ++ v16i32, loadv4f64, sub_ymm>; ++ defm : subvector_zero_lowering<"APSY", "UPSY", VR256, v16f32, v8f32, ++ v16i32, loadv8f32, sub_ymm>; ++ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v8i64, v4i64, ++ v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v16i32, v8i32, ++ v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v32i16, v16i16, ++ v16i32, loadv4i64, sub_ymm>; ++ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v64i8, v32i8, ++ v16i32, loadv4i64, sub_ymm>; + } + + // List of opcodes that guaranteed to zero the upper elements of vector regs. +diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll +index 6ecd8116443..0f2cf594b1c 100644 +--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll ++++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll +@@ -28,13 +28,13 @@ define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noi + define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp { + ; AVX-LABEL: merge_4f64_2f64_2z: + ; AVX: # %bb.0: +-; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ++; AVX-NEXT: vmovups 32(%rdi), %xmm0 + ; AVX-NEXT: retq + ; + ; X32-AVX-LABEL: merge_4f64_2f64_2z: + ; X32-AVX: # %bb.0: + ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0 ++; X32-AVX-NEXT: vmovups 32(%eax), %xmm0 + ; X32-AVX-NEXT: retl + %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 + %val0 = load <2 x double>, <2 x double>* %ptr0 +@@ -109,13 +109,13 @@ define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline + define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp { + ; AVX-LABEL: merge_4f64_f64_45zz: + ; AVX: # %bb.0: +-; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ++; AVX-NEXT: vmovups 32(%rdi), %xmm0 + ; AVX-NEXT: retq + ; + ; X32-AVX-LABEL: merge_4f64_f64_45zz: + ; X32-AVX: # %bb.0: + ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0 ++; X32-AVX-NEXT: vmovups 32(%eax), %xmm0 + ; X32-AVX-NEXT: retl + %ptr0 = getelementptr inbounds double, double* %ptr, i64 4 + %ptr1 = getelementptr inbounds double, double* %ptr, i64 5 +@@ -155,13 +155,13 @@ define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline + define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp { + ; AVX-LABEL: merge_4i64_2i64_3z: + ; AVX: # %bb.0: +-; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ++; AVX-NEXT: vmovups 48(%rdi), %xmm0 + ; AVX-NEXT: retq + ; + ; X32-AVX-LABEL: merge_4i64_2i64_3z: + ; X32-AVX: # %bb.0: + ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0 ++; X32-AVX-NEXT: vmovups 48(%eax), %xmm0 + ; X32-AVX-NEXT: retl + %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3 + %val0 = load <2 x i64>, <2 x i64>* %ptr0 +@@ -217,13 +217,13 @@ define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp { + define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp { + ; AVX-LABEL: merge_4i64_i64_23zz: + ; AVX: # %bb.0: +-; AVX-NEXT: vmovaps 16(%rdi), %xmm0 ++; AVX-NEXT: vmovups 16(%rdi), %xmm0 + ; AVX-NEXT: retq + ; + ; X32-AVX-LABEL: merge_4i64_i64_23zz: + ; X32-AVX: # %bb.0: + ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX-NEXT: vmovaps 16(%eax), 
%xmm0 ++; X32-AVX-NEXT: vmovups 16(%eax), %xmm0 + ; X32-AVX-NEXT: retl + %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2 + %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3 +diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll +index 62102eb382c..3c6eaf65292 100644 +--- a/test/CodeGen/X86/merge-consecutive-loads-512.ll ++++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll +@@ -106,13 +106,13 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noin + define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp { + ; ALL-LABEL: merge_8f64_f64_12zzuuzz: + ; ALL: # %bb.0: +-; ALL-NEXT: vmovaps 8(%rdi), %xmm0 ++; ALL-NEXT: vmovups 8(%rdi), %xmm0 + ; ALL-NEXT: retq + ; + ; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz: + ; X32-AVX512F: # %bb.0: + ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX512F-NEXT: vmovaps 8(%eax), %xmm0 ++; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0 + ; X32-AVX512F-NEXT: retl + %ptr0 = getelementptr inbounds double, double* %ptr, i64 1 + %ptr1 = getelementptr inbounds double, double* %ptr, i64 2 +@@ -190,7 +190,7 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline + define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp { + ; ALL-LABEL: merge_8i64_i64_56zz9uzz: + ; ALL: # %bb.0: +-; ALL-NEXT: vmovaps 40(%rdi), %xmm0 ++; ALL-NEXT: vmovups 40(%rdi), %xmm0 + ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero + ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 + ; ALL-NEXT: retq +@@ -198,7 +198,7 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s + ; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz: + ; X32-AVX512F: # %bb.0: + ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +-; X32-AVX512F-NEXT: vmovaps 40(%eax), %xmm0 ++; X32-AVX512F-NEXT: vmovups 40(%eax), %xmm0 + ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero + ; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 + ; X32-AVX512F-NEXT: retl diff --git a/patches/llvm-rL327898.patch b/patches/llvm-rL327898.patch new file mode 100644 index 0000000..f4d9a43 --- /dev/null +++ b/patches/llvm-rL327898.patch @@ -0,0 +1,6131 @@ +commit 64c3384f94a1eb3e3510d6f66c3bccdfc9d9050b +Author: Nirav Dave +Date: Thu Feb 1 16:11:59 2018 +0000 + + r327898/dependencies roll up + + This is a squash of 13 commits required in the lead up to r327898, + which fixes https://github.com/JuliaLang/julia/issues/27603. The squashed + commits are: + + 332d15e981e86b9e058087174bb288ba18a15807 + b659d3fca5d24c25ee73f979edb382f7f24e05e2 + c01d1363ea080170fc5143d72f26eecd9270f03b + eab8a177a4caef9e42ef1d2aeb4ba15dc788d3f2 + bedb1391781b009ace95f5586e7fae5f03fe0689 + 11d041a905f82ac78e7ccf2394773e80b93d147c + e1ec36c55a0127988f42a3329ca835617b30de09 + b8d2903300c13d8fd151c8e5dc71017269617539 + 00884fea345f47ab05174a8f314ecd60d1676d02 + 28ab04cec0d9888af9d29946b3a048b8340abe0f + 3dd52e62ea3087efcca63c3772183d9471abc742 + bd3649ff6d6b4d18b3c6de253179d987a120518a + aea03035b9c633e6d745b6d3fc5b6378699f576c + + Their commit messages follow below: + + [SelectionDAG] Fix UpdateChains handling of TokenFactors + + Summary: + In Instruction Selection UpdateChains replaces all matched Nodes' + chain references including interior token factors and deletes them. + This may allow nodes which depend on these interior nodes but are not + part of the set of matched nodes to be left with a dangling dependence. 
+    Avoid this by doing the replacement for matched non-TokenFactor nodes.
+
+    Fixes PR36164.
+
+    Reviewers: jonpa, RKSimon, bogner
+
+    Subscribers: llvm-commits, hiraditya
+
+    Differential Revision: https://reviews.llvm.org/D42754
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323977 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    Regenerate test result for vastart-defs-eflags.ll. NFC.
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323596 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    Regenerate test result for testb-je-fusion.ll. NFC.
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323595 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [X86] Avoid using high register trick for test instruction
+
+    Summary:
+    It seems its main effect is to create additional copies when values are in a register that does not support this trick, which increases register pressure and makes the code bigger.
+
+    Reviewers: craig.topper, niravd, spatel, hfinkel
+
+    Subscribers: llvm-commits
+
+    Differential Revision: https://reviews.llvm.org/D42646
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323888 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    Add a regression test for problems caused by D42646. NFC
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323868 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    Add test case for truncation and promotion to test. NFC
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323663 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [X86] Add test case to ensure testw is generated when optimizing for size. NFC
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323687 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [X86] Generate testl instruction through truncates.
+
+    Summary:
+    This was introduced in D42646 but ended up being reverted because the original implementation was buggy.
+
+    Depends on D42646
+
+    Reviewers: craig.topper, niravd, spatel, hfinkel
+
+    Subscribers: llvm-commits
+
+    Differential Revision: https://reviews.llvm.org/D42741
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323899 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [X86] Don't look for TEST instruction shrinking opportunities when the root node is a X86ISD::SUB.
+
+    I don't believe we ever create an X86ISD::SUB with a 0 constant which is what the TEST handling needs. The ternary operator at the end of this code shows up as only going one way in the llvm-cov report from the bots.
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324865 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [X86] Teach LowerBUILD_VECTOR to recognize pair-wise splats of 32-bit elements and use a 64-bit broadcast
+
+    If we are splatting pairs of 32-bit elements, we can use a 64-bit broadcast to get the job done.
+
+    We could probably do this with other sizes too, for example four 16-bit elements. Or we could broadcast pairs of 16-bit elements using a 32-bit element broadcast. But I've left that as a future improvement.
+
+    I've also restricted this to AVX2 only because we can only broadcast loads under AVX.
+
+    Differential Revision: https://reviews.llvm.org/D42086
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@322730 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [DAG, X86] Revert r327197 "Revert r327170, r327171, r327172"
+
+    Reland ISel cycle checking improvements after simplifying node id
+    invariant traversal and correcting typo.
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327898 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [ Modified for cherry-pick: Dropped Hexagon and SystemZ changes ]
+
+    [DAG, X86] Fix ISel-time node insertion ids
+
+    As in SystemZ backend, correctly propagate node ids when inserting new
+    unselected nodes into the DAG during instruction selection for X86
+    target.
+
+    Fixes PR36865.
+
+    Reviewers: jyknight, craig.topper
+
+    Subscribers: hiraditya, llvm-commits
+
+    Differential Revision: https://reviews.llvm.org/D44797
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328233 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    [DAG] Fix node id invalidation in Instruction Selection.
+
+    Invalidation should be bit negation. Add missing negation.
+
+    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@328287 91177308-0d34-0410-b5e6-96231b3b80d8
+
+    Remove failing tests
+
+    This removes tests that are failing due to codegen differences,
+    after the latest set of backports. Fixing these for the backport
+    branch does not seem worth it.
+
+diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
+index de6849a1eae..e56eafc437c 100644
+--- a/include/llvm/CodeGen/SelectionDAGISel.h
++++ b/include/llvm/CodeGen/SelectionDAGISel.h
+@@ -110,6 +110,11 @@ public:
+                             CodeGenOpt::Level OptLevel,
+                             bool IgnoreChains = false);
+ 
++  static void InvalidateNodeId(SDNode *N);
++  static int getUninvalidatedNodeId(SDNode *N);
++
++  static void EnforceNodeIdInvariant(SDNode *N);
++
+   // Opcodes used by the DAG state machine:
+   enum BuiltinOpcodes {
+     OPC_Scope,
+@@ -199,23 +204,28 @@ protected:
+   /// of the new node T.
+   void ReplaceUses(SDValue F, SDValue T) {
+     CurDAG->ReplaceAllUsesOfValueWith(F, T);
++    EnforceNodeIdInvariant(T.getNode());
+   }
+ 
+   /// ReplaceUses - replace all uses of the old nodes F with the use
+   /// of the new nodes T.
+   void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
+     CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
++    for (unsigned i = 0; i < Num; ++i)
++      EnforceNodeIdInvariant(T[i].getNode());
+   }
+ 
+   /// ReplaceUses - replace all uses of the old node F with the use
+   /// of the new node T.
+   void ReplaceUses(SDNode *F, SDNode *T) {
+     CurDAG->ReplaceAllUsesWith(F, T);
++    EnforceNodeIdInvariant(T);
+   }
+ 
+   /// Replace all uses of \c F with \c T, then remove \c F from the DAG.
+   void ReplaceNode(SDNode *F, SDNode *T) {
+     CurDAG->ReplaceAllUsesWith(F, T);
++    EnforceNodeIdInvariant(T);
+     CurDAG->RemoveDeadNode(F);
+   }
+ 
+diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
+index 522c2f1b2cb..2d974234abf 100644
+--- a/include/llvm/CodeGen/SelectionDAGNodes.h
++++ b/include/llvm/CodeGen/SelectionDAGNodes.h
+@@ -796,16 +796,44 @@ public:
+   /// searches to be performed in parallel, caching of results across
+   /// queries and incremental addition to Worklist. Stops early if N is
+   /// found but will resume. Remember to clear Visited and Worklists
+-  /// if DAG changes.
++  /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
++  /// giving up. The TopologicalPrune flag signals that positive NodeIds are
++  /// topologically ordered (Operands have strictly smaller node id) and search
++  /// can be pruned leveraging this.
+   static bool hasPredecessorHelper(const SDNode *N,
+                                    SmallPtrSetImpl<const SDNode *> &Visited,
+                                    SmallVectorImpl<const SDNode *> &Worklist,
+-                                   unsigned int MaxSteps = 0) {
++                                   unsigned int MaxSteps = 0,
++                                   bool TopologicalPrune = false) {
++    SmallVector<const SDNode *, 16> DeferredNodes;
+     if (Visited.count(N))
+       return true;
++
++    // Node Id's are assigned in three places: As a topological
++    // ordering (> 0), during legalization (results in values set to
++    // 0), new nodes (set to -1). If N has a topological id then we
++    // know that all nodes with ids smaller than it cannot be
++    // successors and we need not check them. Filter out all nodes
++    // that can't be matches. We add them to the worklist before exit
++    // in case of multiple calls. Note that during selection the topological id
++    // may be violated if a node's predecessor is selected before it. We mark
++    // this at selection negating the id of unselected successors and
++    // restricting topological pruning to positive ids.
++
++    int NId = N->getNodeId();
++    // If we Invalidated the Id, reconstruct original NId.
++    if (NId < -1)
++      NId = -(NId + 1);
++
++    bool Found = false;
+     while (!Worklist.empty()) {
+       const SDNode *M = Worklist.pop_back_val();
+-      bool Found = false;
++      int MId = M->getNodeId();
++      if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
++          (MId > 0) && (MId < NId)) {
++        DeferredNodes.push_back(M);
++        continue;
++      }
+       for (const SDValue &OpV : M->op_values()) {
+         SDNode *Op = OpV.getNode();
+         if (Visited.insert(Op).second)
+@@ -814,11 +842,16 @@ public:
+           Found = true;
+       }
+       if (Found)
+-        return true;
++        break;
+       if (MaxSteps != 0 && Visited.size() >= MaxSteps)
+-        return false;
++        break;
+     }
+-    return false;
++    // Push deferred nodes back on worklist.
++    Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
++    // If we bailed early, conservatively return found.
++    if (MaxSteps != 0 && Visited.size() >= MaxSteps)
++      return true;
++    return Found;
+   }
+ 
+   /// Return true if all the users of N are contained in Nodes.
+diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+index bd9fcfb5c1e..17e42240133 100644
+--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
++++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+@@ -937,6 +937,58 @@ public:
+ 
+ } // end anonymous namespace
+ 
++// This function is used to enforce the topological node id property
++// leveraged during Instruction selection. Before selection all
++// nodes are given a non-negative id such that all nodes have a larger id than
++// their operands. As this holds transitively we can prune checks that a node N
++// is a predecessor of another node M by not recursively checking through M's
++// operands if N's ID is larger than M's ID. This significantly improves the
++// performance of various legality checks (e.g. IsLegalToFold /
++// UpdateChains).
++
++// However, when we fuse multiple nodes into a single node
++// during selection we may induce a predecessor relationship between inputs and
++// outputs of distinct nodes being merged violating the topological property.
++// Should a fused node have a successor which has yet to be selected, our
++// legality checks would be incorrect. To avoid this we mark all unselected
++// successor nodes, i.e. id != -1, as invalid for pruning by bit-negating (x =>
++// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M.
++// We use bit-negation to more clearly enforce that node id -1 can only be
++// achieved by selected nodes. As the conversion is reversible to the original
++// Id, topological pruning can still be leveraged when looking for unselected
++// nodes. This method is called internally in all ISel replacement calls.
++void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
++  SmallVector<SDNode *, 4> Nodes;
++  Nodes.push_back(Node);
++
++  while (!Nodes.empty()) {
++    SDNode *N = Nodes.pop_back_val();
++    for (auto *U : N->uses()) {
++      auto UId = U->getNodeId();
++      if (UId > 0) {
++        InvalidateNodeId(U);
++        Nodes.push_back(U);
++      }
++    }
++  }
++}
++
++// InvalidateNodeId - As discussed in EnforceNodeIdInvariant, mark a
++// NodeId with the equivalent node id which is invalid for topological
++// pruning.
++void SelectionDAGISel::InvalidateNodeId(SDNode *N) {
++  int InvalidId = -(N->getNodeId() + 1);
++  N->setNodeId(InvalidId);
++}
++
++// getUninvalidatedNodeId - get original uninvalidated node id.
++int SelectionDAGISel::getUninvalidatedNodeId(SDNode *N) {
++  int Id = N->getNodeId();
++  if (Id < -1)
++    return -(Id + 1);
++  return Id;
++}
++
+ void SelectionDAGISel::DoInstructionSelection() {
+   DEBUG(dbgs() << "===== Instruction selection begins: "
+                << printMBBReference(*FuncInfo->MBB) << " '"
+@@ -972,6 +1024,33 @@ void SelectionDAGISel::DoInstructionSelection() {
+       if (Node->use_empty())
+         continue;
+ 
++#ifndef NDEBUG
++      SmallVector<SDNode *, 4> Nodes;
++      Nodes.push_back(Node);
++
++      while (!Nodes.empty()) {
++        auto N = Nodes.pop_back_val();
++        if (N->getOpcode() == ISD::TokenFactor || N->getNodeId() < 0)
++          continue;
++        for (const SDValue &Op : N->op_values()) {
++          if (Op->getOpcode() == ISD::TokenFactor)
++            Nodes.push_back(Op.getNode());
++          else {
++            // We rely on topological ordering of node ids for checking for
++            // cycles when fusing nodes during selection. All unselected
++            // successors of an already selected node should have a negative id.
++            // This assertion will catch such cases. If this assertion triggers
++            // it is likely you are using DAG-level Value/Node replacement
++            // functions (versus equivalent ISEL replacement) in
++            // backend-specific selections. See comment in
++            // EnforceNodeIdInvariant for more details.
++            assert(Op->getNodeId() != -1 &&
++                   "Node has already selected predecessor node");
++          }
++        }
++      }
++#endif
++
+       // When we are using non-default rounding modes or FP exception behavior
+       // FP operations are represented by StrictFP pseudo-operations. They
+       // need to be simplified here so that the target-specific instruction
+@@ -2134,52 +2213,44 @@ static SDNode *findGlueUse(SDNode *N) {
+   return nullptr;
+ }
+ 
+-/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+-/// This function iteratively traverses up the operand chain, ignoring
+-/// certain nodes.
+-static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+-                          SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
++/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
++/// beyond "ImmedUse". We may ignore chains as they are checked separately.
++static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+                           bool IgnoreChains) {
+-  // The NodeID's are given uniques ID's where a node ID is guaranteed to be
+-  // greater than all of its (recursive) operands. If we scan to a point where
+-  // 'use' is smaller than the node we're scanning for, then we know we will
+-  // never find it.
+-  //
+-  // The Use may be -1 (unassigned) if it is a newly allocated node. This can
+-  // happen because we scan down to newly selected nodes in the case of glue
+-  // uses.
+- std::vector WorkList; +- WorkList.push_back(Use); +- +- while (!WorkList.empty()) { +- Use = WorkList.back(); +- WorkList.pop_back(); +- if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) +- continue; ++ SmallPtrSet Visited; ++ SmallVector WorkList; ++ // Only check if we have non-immediate uses of Def. ++ if (ImmedUse->isOnlyUserOf(Def)) ++ return false; + +- // Don't revisit nodes if we already scanned it and didn't fail, we know we +- // won't fail if we scan it again. +- if (!Visited.insert(Use).second) ++ // We don't care about paths to Def that go through ImmedUse so mark it ++ // visited and mark non-def operands as used. ++ Visited.insert(ImmedUse); ++ for (const SDValue &Op : ImmedUse->op_values()) { ++ SDNode *N = Op.getNode(); ++ // Ignore chain deps (they are validated by ++ // HandleMergeInputChains) and immediate uses ++ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def) + continue; ++ if (!Visited.insert(N).second) ++ continue; ++ WorkList.push_back(N); ++ } + +- for (const SDValue &Op : Use->op_values()) { +- // Ignore chain uses, they are validated by HandleMergeInputChains. +- if (Op.getValueType() == MVT::Other && IgnoreChains) +- continue; +- ++ // Initialize worklist to operands of Root. ++ if (Root != ImmedUse) { ++ for (const SDValue &Op : Root->op_values()) { + SDNode *N = Op.getNode(); +- if (N == Def) { +- if (Use == ImmedUse || Use == Root) +- continue; // We are not looking for immediate use. +- assert(N != Root); +- return true; +- } +- +- // Traverse up the operand chain. ++ // Ignore chains (they are validated by HandleMergeInputChains) ++ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def) ++ continue; ++ if (!Visited.insert(N).second) ++ continue; + WorkList.push_back(N); + } + } +- return false; ++ ++ return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true); + } + + /// IsProfitableToFold - Returns true if it's profitable to fold the specific +@@ -2251,13 +2322,12 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, + + // If our query node has a glue result with a use, we've walked up it. If + // the user (which has already been selected) has a chain or indirectly uses +- // the chain, our WalkChainUsers predicate will not consider it. Because of ++ // the chain, HandleMergeInputChains will not consider it. Because of + // this, we cannot ignore chains in this predicate. + IgnoreChains = false; + } + +- SmallPtrSet Visited; +- return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); ++ return !findNonImmUse(Root, N.getNode(), U, IgnoreChains); + } + + void SelectionDAGISel::Select_INLINEASM(SDNode *N) { +@@ -2360,7 +2430,8 @@ void SelectionDAGISel::UpdateChains( + std::replace(ChainNodesMatched.begin(), ChainNodesMatched.end(), N, + static_cast(nullptr)); + }); +- CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain); ++ if (ChainNode->getOpcode() != ISD::TokenFactor) ++ ReplaceUses(ChainVal, InputChain); + + // If the node became dead and we haven't already seen it, delete it. + if (ChainNode != NodeToMatch && ChainNode->use_empty() && +@@ -2375,143 +2446,6 @@ void SelectionDAGISel::UpdateChains( + DEBUG(dbgs() << "ISEL: Match complete!\n"); + } + +-enum ChainResult { +- CR_Simple, +- CR_InducesCycle, +- CR_LeadsToInteriorNode +-}; +- +-/// WalkChainUsers - Walk down the users of the specified chained node that is +-/// part of the pattern we're matching, looking at all of the users we find. 
+-/// This determines whether something is an interior node, whether we have a +-/// non-pattern node in between two pattern nodes (which prevent folding because +-/// it would induce a cycle) and whether we have a TokenFactor node sandwiched +-/// between pattern nodes (in which case the TF becomes part of the pattern). +-/// +-/// The walk we do here is guaranteed to be small because we quickly get down to +-/// already selected nodes "below" us. +-static ChainResult +-WalkChainUsers(const SDNode *ChainedNode, +- SmallVectorImpl &ChainedNodesInPattern, +- DenseMap &TokenFactorResult, +- SmallVectorImpl &InteriorChainedNodes) { +- ChainResult Result = CR_Simple; +- +- for (SDNode::use_iterator UI = ChainedNode->use_begin(), +- E = ChainedNode->use_end(); UI != E; ++UI) { +- // Make sure the use is of the chain, not some other value we produce. +- if (UI.getUse().getValueType() != MVT::Other) continue; +- +- SDNode *User = *UI; +- +- if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph. +- continue; +- +- // If we see an already-selected machine node, then we've gone beyond the +- // pattern that we're selecting down into the already selected chunk of the +- // DAG. +- unsigned UserOpcode = User->getOpcode(); +- if (User->isMachineOpcode() || +- UserOpcode == ISD::CopyToReg || +- UserOpcode == ISD::CopyFromReg || +- UserOpcode == ISD::INLINEASM || +- UserOpcode == ISD::EH_LABEL || +- UserOpcode == ISD::LIFETIME_START || +- UserOpcode == ISD::LIFETIME_END) { +- // If their node ID got reset to -1 then they've already been selected. +- // Treat them like a MachineOpcode. +- if (User->getNodeId() == -1) +- continue; +- } +- +- // If we have a TokenFactor, we handle it specially. +- if (User->getOpcode() != ISD::TokenFactor) { +- // If the node isn't a token factor and isn't part of our pattern, then it +- // must be a random chained node in between two nodes we're selecting. +- // This happens when we have something like: +- // x = load ptr +- // call +- // y = x+4 +- // store y -> ptr +- // Because we structurally match the load/store as a read/modify/write, +- // but the call is chained between them. We cannot fold in this case +- // because it would induce a cycle in the graph. +- if (!std::count(ChainedNodesInPattern.begin(), +- ChainedNodesInPattern.end(), User)) +- return CR_InducesCycle; +- +- // Otherwise we found a node that is part of our pattern. For example in: +- // x = load ptr +- // y = x+4 +- // store y -> ptr +- // This would happen when we're scanning down from the load and see the +- // store as a user. Record that there is a use of ChainedNode that is +- // part of the pattern and keep scanning uses. +- Result = CR_LeadsToInteriorNode; +- InteriorChainedNodes.push_back(User); +- continue; +- } +- +- // If we found a TokenFactor, there are two cases to consider: first if the +- // TokenFactor is just hanging "below" the pattern we're matching (i.e. no +- // uses of the TF are in our pattern) we just want to ignore it. Second, +- // the TokenFactor can be sandwiched in between two chained nodes, like so: +- // [Load chain] +- // ^ +- // | +- // [Load] +- // ^ ^ +- // | \ DAG's like cheese +- // / \ do you? +- // / | +- // [TokenFactor] [Op] +- // ^ ^ +- // | | +- // \ / +- // \ / +- // [Store] +- // +- // In this case, the TokenFactor becomes part of our match and we rewrite it +- // as a new TokenFactor. +- // +- // To distinguish these two cases, do a recursive walk down the uses. 
+- auto MemoizeResult = TokenFactorResult.find(User); +- bool Visited = MemoizeResult != TokenFactorResult.end(); +- // Recursively walk chain users only if the result is not memoized. +- if (!Visited) { +- auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult, +- InteriorChainedNodes); +- MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first; +- } +- switch (MemoizeResult->second) { +- case CR_Simple: +- // If the uses of the TokenFactor are just already-selected nodes, ignore +- // it, it is "below" our pattern. +- continue; +- case CR_InducesCycle: +- // If the uses of the TokenFactor lead to nodes that are not part of our +- // pattern that are not selected, folding would turn this into a cycle, +- // bail out now. +- return CR_InducesCycle; +- case CR_LeadsToInteriorNode: +- break; // Otherwise, keep processing. +- } +- +- // Okay, we know we're in the interesting interior case. The TokenFactor +- // is now going to be considered part of the pattern so that we rewrite its +- // uses (it may have uses that are not part of the pattern) with the +- // ultimate chain result of the generated code. We will also add its chain +- // inputs as inputs to the ultimate TokenFactor we create. +- Result = CR_LeadsToInteriorNode; +- if (!Visited) { +- ChainedNodesInPattern.push_back(User); +- InteriorChainedNodes.push_back(User); +- } +- } +- +- return Result; +-} +- + /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains + /// operation for when the pattern matched at least one node with a chains. The + /// input vector contains a list of all of the chained nodes that we match. We +@@ -2521,47 +2455,56 @@ WalkChainUsers(const SDNode *ChainedNode, + static SDValue + HandleMergeInputChains(SmallVectorImpl &ChainNodesMatched, + SelectionDAG *CurDAG) { +- // Used for memoization. Without it WalkChainUsers could take exponential +- // time to run. +- DenseMap TokenFactorResult; +- // Walk all of the chained nodes we've matched, recursively scanning down the +- // users of the chain result. This adds any TokenFactor nodes that are caught +- // in between chained nodes to the chained and interior nodes list. +- SmallVector InteriorChainedNodes; +- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { +- if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched, +- TokenFactorResult, +- InteriorChainedNodes) == CR_InducesCycle) +- return SDValue(); // Would induce a cycle. +- } + +- // Okay, we have walked all the matched nodes and collected TokenFactor nodes +- // that we are interested in. Form our input TokenFactor node. ++ SmallPtrSet Visited; ++ SmallVector Worklist; + SmallVector InputChains; +- for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { +- // Add the input chain of this node to the InputChains list (which will be +- // the operands of the generated TokenFactor) if it's not an interior node. +- SDNode *N = ChainNodesMatched[i]; +- if (N->getOpcode() != ISD::TokenFactor) { +- if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N)) +- continue; ++ unsigned int Max = 8192; + +- // Otherwise, add the input chain. +- SDValue InChain = ChainNodesMatched[i]->getOperand(0); +- assert(InChain.getValueType() == MVT::Other && "Not a chain"); +- InputChains.push_back(InChain); +- continue; +- } ++ // Quick exit on trivial merge. 
++ if (ChainNodesMatched.size() == 1) ++ return ChainNodesMatched[0]->getOperand(0); + +- // If we have a token factor, we want to add all inputs of the token factor +- // that are not part of the pattern we're matching. +- for (const SDValue &Op : N->op_values()) { +- if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(), +- Op.getNode())) +- InputChains.push_back(Op); +- } ++ // Add chains that aren't already added (internal). Peek through ++ // token factors. ++ std::function AddChains = [&](const SDValue V) { ++ if (V.getValueType() != MVT::Other) ++ return; ++ if (V->getOpcode() == ISD::EntryToken) ++ return; ++ if (!Visited.insert(V.getNode()).second) ++ return; ++ if (V->getOpcode() == ISD::TokenFactor) { ++ for (const SDValue &Op : V->op_values()) ++ AddChains(Op); ++ } else ++ InputChains.push_back(V); ++ }; ++ ++ for (auto *N : ChainNodesMatched) { ++ Worklist.push_back(N); ++ Visited.insert(N); + } + ++ while (!Worklist.empty()) ++ AddChains(Worklist.pop_back_val()->getOperand(0)); ++ ++ // Skip the search if there are no chain dependencies. ++ if (InputChains.size() == 0) ++ return CurDAG->getEntryNode(); ++ ++ // If one of these chains is a successor of input, we must have a ++ // node that is both the predecessor and successor of the ++ // to-be-merged nodes. Fail. ++ Visited.clear(); ++ for (SDValue V : InputChains) ++ Worklist.push_back(V.getNode()); ++ ++ for (auto *N : ChainNodesMatched) ++ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true)) ++ return SDValue(); ++ ++ // Return merged chain. + if (InputChains.size() == 1) + return InputChains[0]; + return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]), +@@ -2606,8 +2549,8 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, + // Move the glue if needed. + if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 && + (unsigned)OldGlueResultNo != ResNumResults-1) +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo), +- SDValue(Res, ResNumResults-1)); ++ ReplaceUses(SDValue(Node, OldGlueResultNo), ++ SDValue(Res, ResNumResults - 1)); + + if ((EmitNodeInfo & OPFL_GlueOutput) != 0) + --ResNumResults; +@@ -2615,14 +2558,15 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, + // Move the chain reference if needed. + if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 && + (unsigned)OldChainResultNo != ResNumResults-1) +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo), +- SDValue(Res, ResNumResults-1)); ++ ReplaceUses(SDValue(Node, OldChainResultNo), ++ SDValue(Res, ResNumResults - 1)); + + // Otherwise, no replacement happened because the node already exists. Replace + // Uses of the old node with the new one. 
+ if (Res != Node) { +- CurDAG->ReplaceAllUsesWith(Node, Res); +- CurDAG->RemoveDeadNode(Node); ++ ReplaceNode(Node, Res); ++ } else { ++ EnforceNodeIdInvariant(Res); + } + + return Res; +@@ -2939,8 +2883,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + return; + case ISD::AssertSext: + case ISD::AssertZext: +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0), +- NodeToMatch->getOperand(0)); ++ ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0)); + CurDAG->RemoveDeadNode(NodeToMatch); + return; + case ISD::INLINEASM: +@@ -3702,7 +3645,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + NodeToMatch->getValueType(i).getSizeInBits() == + Res.getValueSizeInBits()) && + "invalid replacement"); +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res); ++ ReplaceUses(SDValue(NodeToMatch, i), Res); + } + + // Update chain uses. +@@ -3715,8 +3658,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, + if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) == + MVT::Glue && + InputGlue.getNode()) +- CurDAG->ReplaceAllUsesOfValueWith( +- SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue); ++ ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), ++ InputGlue); + + assert(NodeToMatch->use_empty() && + "Didn't replace all uses of the node?"); +diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +index f4776adb069..be5345e422d 100644 +--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp ++++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +@@ -759,12 +759,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + + if (ProduceCarry) { + // Replace the carry-use +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); ++ ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); + } + + // Replace the remaining uses. 
+- CurDAG->ReplaceAllUsesWith(N, RegSequence); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, RegSequence); + } + + void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { +diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp +index 8d32510e200..0f504718f28 100644 +--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp ++++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp +@@ -498,7 +498,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, + + void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); +- CurDAG->ReplaceAllUsesWith(N, M); ++ ReplaceUses(N, M); + } + + bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, +diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +index a6ac4e3df74..3721856ff45 100644 +--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp ++++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +@@ -777,7 +777,7 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) { + return; + } + +- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0)); ++ ReplaceUses(SDValue(N, 0), N->getOperand(0)); + CurDAG->RemoveDeadNode(N); + } + +@@ -2182,4 +2182,3 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() { + RootHeights.clear(); + RootWeights.clear(); + } +- +diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +index f08c5054065..0608f06ef7e 100644 +--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp ++++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +@@ -1914,7 +1914,6 @@ void HvxSelector::selectShuffle(SDNode *N) { + // If the mask is all -1's, generate "undef". + if (!UseLeft && !UseRight) { + ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode()); +- DAG.RemoveDeadNode(N); + return; + } + +@@ -1970,7 +1969,6 @@ void HvxSelector::selectRor(SDNode *N) { + NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV}); + + ISel.ReplaceNode(N, NewN); +- DAG.RemoveDeadNode(N); + } + + void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) { +@@ -2017,8 +2015,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) { + MemOp[0] = cast(N)->getMemOperand(); + cast(Result)->setMemRefs(MemOp, MemOp + 1); + +- ReplaceUses(N, Result); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, Result); + } + + void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { +@@ -2056,8 +2053,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) { + MemOp[0] = cast(N)->getMemOperand(); + cast(Result)->setMemRefs(MemOp, MemOp + 1); + +- ReplaceUses(N, Result); +- CurDAG->RemoveDeadNode(N); ++ ReplaceNode(N, Result); + } + + void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { +@@ -2100,5 +2096,3 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) { + ReplaceUses(SDValue(N, 1), SDValue(Result, 1)); + CurDAG->RemoveDeadNode(N); + } +- +- +diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +index ce6f3d37f5c..fe59d820c88 100644 +--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp ++++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +@@ -589,10 +589,16 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr, + // The selection DAG must no longer depend on their uniqueness when this + // function is used. 
+ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
+- if (N.getNode()->getNodeId() == -1 ||
+- N.getNode()->getNodeId() > Pos->getNodeId()) {
++ if (N->getNodeId() == -1 ||
++ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
++ SelectionDAGISel::getUninvalidatedNodeId(Pos))) {
+ DAG->RepositionNode(Pos->getIterator(), N.getNode());
+- N.getNode()->setNodeId(Pos->getNodeId());
++ // Mark Node as invalid for pruning, as after this it may be a successor to a
++ // selected node but otherwise be in the same position as Pos.
++ // Conservatively mark it with the same -abs(Id) to ensure the node id
++ // invariant is preserved.
++ N->setNodeId(Pos->getNodeId());
++ SelectionDAGISel::InvalidateNodeId(N.getNode());
+ }
+ }
+
+@@ -1022,8 +1028,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+ };
+ SDValue New = convertTo(
+ DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
+- ReplaceUses(N, New.getNode());
+- CurDAG->RemoveDeadNode(N);
++ ReplaceNode(N, New.getNode());
+ return true;
+ }
+
+@@ -1114,8 +1119,7 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+ SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
+ SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
+
+- ReplaceUses(Node, Or.getNode());
+- CurDAG->RemoveDeadNode(Node);
++ ReplaceNode(Node, Or.getNode());
+
+ SelectCode(Or.getNode());
+ }
+diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
+index d79fd0ca4da..ee2d221e31c 100644
+--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
++++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
+@@ -988,10 +988,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+ // IDs! The selection DAG must no longer depend on their uniqueness when this
+ // is used.
+ static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+- if (N.getNode()->getNodeId() == -1 ||
+- N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+- DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
+- N.getNode()->setNodeId(Pos.getNode()->getNodeId());
++ if (N->getNodeId() == -1 ||
++ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
++ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
++ DAG.RepositionNode(Pos->getIterator(), N.getNode());
++ // Mark Node as invalid for pruning, as after this it may be a successor to a
++ // selected node but otherwise be in the same position as Pos.
++ // Conservatively mark it with the same -abs(Id) to ensure the node id
++ // invariant is preserved.
++ N->setNodeId(Pos->getNodeId());
++ SelectionDAGISel::InvalidateNodeId(N.getNode());
+ }
+ }
+
+@@ -2092,50 +2098,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
+ LoadNode->getOffset() != StoreNode->getOffset())
+ return false;
+
+- // Check if the chain is produced by the load or is a TokenFactor with
+- // the load output chain as an operand. Return InputChain by reference.
++ bool FoundLoad = false;
++ SmallVector<SDValue, 4> ChainOps;
++ SmallVector<const SDNode *, 4> LoopWorklist;
++ SmallPtrSet<const SDNode *, 16> Visited;
++ const unsigned int Max = 1024;
++
++ // Visualization of Load-Op-Store fusion:
++ // -------------------------
++ // Legend:
++ // *-lines = Chain operand dependencies.
++ // |-lines = Normal operand dependencies.
++ // Dependencies flow down and right. n-suffix references multiple nodes.
++ //
++ // C Xn C
++ // * * *
++ // * * *
++ // Xn A-LD Yn TF Yn
++ // * * \ | * |
++ // * * \ | * |
++ // * * \ | => A--LD_OP_ST
++ // * * \| \
++ // TF OP \
++ // * | \ Zn
++ // * | \
++ // A-ST Zn
++ //
++
++ // This merge induces dependences from: #1: Xn -> LD, OP, Zn
++ // #2: Yn -> LD
++ // #3: ST -> Zn
++
++ // Ensure the transform is safe by checking for the dual
++ // dependencies to make sure we do not induce a loop.
++
++ // As LD is a predecessor to both OP and ST we can do this by checking:
++ // (a) if LD is a predecessor to a member of Xn or Yn.
++ // (b) if a Zn is a predecessor to ST.
++
++ // However, (b) can only occur through being a chain predecessor to
++ // ST, which is the same as Zn being a member or predecessor of Xn,
++ // which is a subset of LD being a predecessor of Xn. So it's
++ // subsumed by check (a).
++
+ SDValue Chain = StoreNode->getChain();
+
+- bool ChainCheck = false;
++ // Gather X elements in ChainOps.
+ if (Chain == Load.getValue(1)) {
+- ChainCheck = true;
+- InputChain = LoadNode->getChain();
++ FoundLoad = true;
++ ChainOps.push_back(Load.getOperand(0));
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
+- SmallVector<SDValue, 4> ChainOps;
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+ SDValue Op = Chain.getOperand(i);
+ if (Op == Load.getValue(1)) {
+- ChainCheck = true;
++ FoundLoad = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
+ continue;
+ }
+-
+- // Make sure using Op as part of the chain would not cause a cycle here.
+- // In theory, we could check whether the chain node is a predecessor of
+- // the load. But that can be very expensive. Instead visit the uses and
+- // make sure they all have smaller node id than the load.
+- int LoadId = LoadNode->getNodeId();
+- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+- UE = UI->use_end(); UI != UE; ++UI) {
+- if (UI.getUse().getResNo() != 0)
+- continue;
+- if (UI->getNodeId() > LoadId)
+- return false;
+- }
+-
++ LoopWorklist.push_back(Op.getNode());
+ ChainOps.push_back(Op);
+ }
+-
+- if (ChainCheck)
+- // Make a new TokenFactor with all the other input chains except
+- // for the load.
+- InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+- MVT::Other, ChainOps);
+ }
+- if (!ChainCheck)
++
++ if (!FoundLoad)
++ return false;
++
++ // Worklist is currently Xn. Add Yn to worklist.
++ for (SDValue Op : StoredVal->ops())
++ if (Op.getNode() != LoadNode)
++ LoopWorklist.push_back(Op.getNode());
++
++ // Check (a) if Load is a predecessor to Xn + Yn
++ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
++ true))
+ return false;
+
++ InputChain =
++ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
+ return true;
+ }
+
+@@ -2335,6 +2375,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
+ MemOp[1] = LoadNode->getMemOperand();
+ Result->setMemRefs(MemOp, MemOp + 2);
+
++ // Update Load Chain uses as well.
++ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+@@ -2946,12 +2988,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
+ return;
+ }
+
+- case X86ISD::CMP:
+- case X86ISD::SUB: {
+- // Sometimes a SUB is used to perform comparison.
+- if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+- // This node is not a CMP.
+- break;
++ case X86ISD::CMP: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+@@ -2971,95 +3008,52 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
+ if (!C) break;
+ uint64_t Mask = C->getZExtValue();
+
+- // For example, convert "testl %eax, $8" to "testb %al, $8"
++ MVT VT;
++ int SubRegOp;
++ unsigned Op;
++
+ if (isUInt<8>(Mask) &&
+ (!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
+- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
+- SDValue Reg = N0.getOperand(0);
+-
+- // Extract the l-register.
+- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+- MVT::i8, Reg);
+-
+- // Emit a testb.
+- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+- Subreg, Imm);
+- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+- // one, do not call ReplaceAllUsesWith.
+- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+- SDValue(NewNode, 0));
+- CurDAG->RemoveDeadNode(Node);
+- return;
++ // For example, convert "testl %eax, $8" to "testb %al, $8"
++ VT = MVT::i8;
++ SubRegOp = X86::sub_8bit;
++ Op = X86::TEST8ri;
++ } else if (OptForMinSize && isUInt<16>(Mask) &&
++ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
++ // For example, "testl %eax, $32776" to "testw %ax, $32776".
++ // NOTE: We only want to form TESTW instructions if optimizing for
++ // min size. Otherwise we only save one byte and possibly get a length
++ // changing prefix penalty in the decoders.
++ VT = MVT::i16;
++ SubRegOp = X86::sub_16bit;
++ Op = X86::TEST16ri;
++ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
++ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
++ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
++ // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
++ // Otherwise, we find ourselves in a position where we have to do
++ // promotion. If previous passes did not promote the and, we assume
++ // they had a good reason not to and do not promote here.
++ VT = MVT::i32;
++ SubRegOp = X86::sub_32bit;
++ Op = X86::TEST32ri;
++ } else {
++ // No eligible transformation was found.
++ break;
+ }
+
+- // For example, "testl %eax, $2048" to "testb %ah, $8".
+- if (isShiftedUInt<8, 8>(Mask) &&
+- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+- // Shift the immediate right by 8 bits.
+- SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
+- SDValue Reg = N0.getOperand(0);
+-
+- // Extract the h-register.
+- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+- MVT::i8, Reg);
+-
+- // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
+- // target GR8_NOREX registers, so make sure the register class is
+- // forced.
+- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+- MVT::i32, Subreg, ShiftedImm);
+- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+- // one, do not call ReplaceAllUsesWith.
+- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+- SDValue(NewNode, 0));
+- CurDAG->RemoveDeadNode(Node);
+- return;
+- }
++ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
++ SDValue Reg = N0.getOperand(0);
+
+- // For example, "testl %eax, $32776" to "testw %ax, $32776".
+- // NOTE: We only want to form TESTW instructions if optimizing for
+- // min size. Otherwise we only save one byte and possibly get a length
+- // changing prefix penalty in the decoders.
+- if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
+- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
+- SDValue Reg = N0.getOperand(0);
+-
+- // Extract the 16-bit subregister.
+- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+- MVT::i16, Reg);
+-
+- // Emit a testw.
+- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+- Subreg, Imm);
+- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+- // one, do not call ReplaceAllUsesWith.
+- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+- SDValue(NewNode, 0));
+- CurDAG->RemoveDeadNode(Node);
+- return;
+- }
++ // Extract the subregister if necessary.
++ if (N0.getValueType() != VT)
++ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
+
+- // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+- if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
+- (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
+- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
+- SDValue Reg = N0.getOperand(0);
+-
+- // Extract the 32-bit subregister.
+- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+- MVT::i32, Reg);
+-
+- // Emit a testl.
+- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+- Subreg, Imm);
+- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+- // one, do not call ReplaceAllUsesWith.
+- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+- SDValue(NewNode, 0));
+- CurDAG->RemoveDeadNode(Node);
+- return;
+- }
++ // Emit a testb, testw, or testl.
++ SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
++ // Replace CMP with TEST.
++ ReplaceNode(Node, NewNode);
++ return;
+ }
+ break;
+ }
+diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
+index c1ddb771e2f..86e71cba87b 100644
+--- a/lib/Target/X86/X86ISelLowering.cpp
++++ b/lib/Target/X86/X86ISelLowering.cpp
+@@ -8131,6 +8131,32 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ return LD;
+ }
+
++ // If this is a splat of pairs of 32-bit elements, we can use a narrower
++ // build_vector and broadcast it.
++ // TODO: We could probably generalize this more.
++ if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
++ SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
++ DAG.getUNDEF(ExtVT), DAG.getUNDEF(ExtVT) };
++ auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
++ // Make sure all the even/odd operands match.
++ for (unsigned i = 2; i != NumElems; ++i)
++ if (Ops[i % 2] != Op.getOperand(i))
++ return false;
++ return true;
++ };
++ if (CanSplat(Op, NumElems, Ops)) {
++ MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
++ MVT NarrowVT = MVT::getVectorVT(ExtVT, 4);
++ // Create a new build vector and cast to v2i64/v2f64.
++ SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
++ DAG.getBuildVector(NarrowVT, dl, Ops));
++ // Broadcast from v2i64/v2f64 and cast to final VT.
++ MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
++ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
++ NewBV));
++ }
++ }
++
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
+index 98cc8fb7439..3d5de637da2 100644
+--- a/lib/Target/X86/X86InstrArithmetic.td
++++ b/lib/Target/X86/X86InstrArithmetic.td
+@@ -1257,14 +1257,6 @@ let isCompare = 1 in {
+ def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+ let Predicates = [In64BitMode] in
+ def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
+-
+- // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+- // register class is constrained to GR8_NOREX. This pseudo is explicitly
+- // marked side-effect free, since it doesn't have an isel pattern like
+- // other test instructions.
+- let isPseudo = 1, hasSideEffects = 0 in
+- def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+- "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+ } // Defs = [EFLAGS]
+
+ def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
+diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
+index 11ada51a870..84a9200a0ef 100644
+--- a/lib/Target/X86/X86InstrInfo.cpp
++++ b/lib/Target/X86/X86InstrInfo.cpp
+@@ -7854,9 +7854,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ case X86::VMOVUPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+- case X86::TEST8ri_NOREX:
+- MI.setDesc(get(X86::TEST8ri));
+- return true;
+ case X86::MOV32ri64:
+ MI.setDesc(get(X86::MOV32ri));
+ return true;
+diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
+index 67d95c2233d..4e11397dec4 100644
+--- a/lib/Target/X86/X86MacroFusion.cpp
++++ b/lib/Target/X86/X86MacroFusion.cpp
+@@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ case X86::TEST16mr:
+ case X86::TEST32mr:
+ case X86::TEST64mr:
+- case X86::TEST8ri_NOREX:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+diff --git a/test/CodeGen/SystemZ/pr36164.ll b/test/CodeGen/SystemZ/pr36164.ll
+new file mode 100644
+index 00000000000..0c850091d31
+--- /dev/null
++++ b/test/CodeGen/SystemZ/pr36164.ll
+@@ -0,0 +1,113 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc %s -o - -mtriple=s390x-linux-gnu -mcpu=z13 -disable-basicaa | FileCheck %s
++
++; This test checks that we do not keep a reference to a deleted node.
++ ++%0 = type { i32 } ++ ++@g_11 = external dso_local unnamed_addr global i1, align 4 ++@g_69 = external dso_local global i32, align 4 ++@g_73 = external dso_local unnamed_addr global i32, align 4 ++@g_832 = external dso_local constant %0, align 4 ++@g_938 = external dso_local unnamed_addr global i64, align 8 ++ ++; Function Attrs: nounwind ++define void @main() local_unnamed_addr #0 { ++; CHECK-LABEL: main: ++; CHECK: # %bb.0: ++; CHECK-NEXT: stmg %r12, %r15, 96(%r15) ++; CHECK-NEXT: .cfi_offset %r12, -64 ++; CHECK-NEXT: .cfi_offset %r13, -56 ++; CHECK-NEXT: .cfi_offset %r14, -48 ++; CHECK-NEXT: .cfi_offset %r15, -40 ++; CHECK-NEXT: lhi %r0, 1 ++; CHECK-NEXT: larl %r1, g_938 ++; CHECK-NEXT: lhi %r2, 2 ++; CHECK-NEXT: lhi %r3, 3 ++; CHECK-NEXT: lhi %r4, 0 ++; CHECK-NEXT: lhi %r5, 4 ++; CHECK-NEXT: larl %r14, g_11 ++; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: strl %r0, g_73 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: strl %r0, g_69 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: lghi %r13, 24 ++; CHECK-NEXT: strl %r2, g_69 ++; CHECK-NEXT: ag %r13, 0(%r1) ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r3, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r4, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r0, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r2, g_69 ++; CHECK-NEXT: lrl %r12, g_832 ++; CHECK-NEXT: strl %r3, g_69 ++; CHECK-NEXT: stgrl %r13, g_938 ++; CHECK-NEXT: lrl %r13, g_832 ++; CHECK-NEXT: strl %r5, g_69 ++; CHECK-NEXT: mvi 0(%r14), 1 ++; CHECK-NEXT: j .LBB0_1 ++ br label %1 ++ ++;