From 19487a72e4b470bed95e788e1b2198db641d8671 Mon Sep 17 00:00:00 2001
From: Jonathan Moore Liles <j.liles@unix.net>
Date: Tue, 13 Aug 2013 22:48:42 -0700
Subject: [PATCH] Tweak buffer ops for GCC auto vectorization.

---
 mixer/src/Chain.C                 |  8 +--
 mixer/src/Meter_Module.C          | 20 +-------
 mixer/src/Module.H                |  2 +-
 nonlib/dsp.C                      | 83 ++++++++++++++++++++++++-------
 nonlib/dsp.h                      |  5 +-
 timeline/src/Engine/Playback_DS.C |  9 ++--
 timeline/src/Engine/Record_DS.C   | 14 +++---
 wscript                           |  4 +-
 8 files changed, 88 insertions(+), 57 deletions(-)

diff --git a/mixer/src/Chain.C b/mixer/src/Chain.C
index 52b1123..f461d93 100644
--- a/mixer/src/Chain.C
+++ b/mixer/src/Chain.C
@@ -176,7 +176,7 @@ Chain::~Chain ( )
     client()->lock();
 
     for ( unsigned int i = scratch_port.size(); i--; )
-        delete[] (sample_t*)scratch_port[i].buffer();
+        free( (sample_t*)scratch_port[i].buffer() );
     
     /* if we leave this up to FLTK, it will happen after we've
      already destroyed the client */
@@ -376,13 +376,13 @@ Chain::configure_ports ( void )
     if ( scratch_port.size() < req_buffers )
     {
         for ( unsigned int i = scratch_port.size(); i--; )
-            delete[] (sample_t*)scratch_port[i].buffer();
+            free(scratch_port[i].buffer());
         scratch_port.clear();
 
         for ( unsigned int i = 0; i < req_buffers; ++i )
         {
             Module::Port p( NULL, Module::Port::OUTPUT, Module::Port::AUDIO );
-            p.connect_to( new sample_t[client()->nframes()] );
+            p.connect_to( buffer_alloc( client()->nframes() ) );
             buffer_fill_with_silence( (sample_t*)p.buffer(), client()->nframes() );
             scratch_port.push_back( p );
         }
@@ -812,7 +812,7 @@ void
 Chain::buffer_size ( nframes_t nframes )
 {
     for ( unsigned int i = scratch_port.size(); i--; )
-        delete[] (sample_t*)scratch_port[i].buffer();
+        free(scratch_port[i].buffer());
     scratch_port.clear();
 
     configure_ports();
diff --git a/mixer/src/Meter_Module.C b/mixer/src/Meter_Module.C
index 6984c91..64aebe1 100644
--- a/mixer/src/Meter_Module.C
+++ b/mixer/src/Meter_Module.C
@@ -29,6 +29,7 @@
 #include "Meter_Module.H"
 #include "DPM.H"
 #include "JACK/Port.H"
+#include "dsp.h"
 
 
 
@@ -170,23 +171,6 @@ Meter_Module::handle ( int m )
 /* Engine */
 /**********/
 
-static float
-get_peak_sample ( const sample_t* buf, nframes_t nframes )
-{
-    float p = 0.0f;
-
-    const sample_t *f = buf;
-
-    for ( int j = nframes; j--; ++f )
-    {
-        const float s = fabs( *f );
-
-        if ( s > p )
-            p = s;
-    }
-
-    return p;
-}
 
 void
 Meter_Module::process ( nframes_t nframes )
@@ -196,7 +180,7 @@ Meter_Module::process ( nframes_t nframes )
         if ( audio_input[i].connected() )
         {
 //            float dB = 20 * log10( get_peak_sample( (float*)audio_input[i].buffer(), nframes ) / 2.0f );
-            float dB = 20 * log10( get_peak_sample( (float*)audio_input[i].buffer(), nframes ) );
+            float dB = 20 * log10( buffer_get_peak( (sample_t*) audio_input[i].buffer(), nframes ) );
 
             ((float*)control_output[0].buffer())[i] = dB;
             if (dB > control_value[i])
diff --git a/mixer/src/Module.H b/mixer/src/Module.H
index f71850a..716cc0f 100644
--- a/mixer/src/Module.H
+++ b/mixer/src/Module.H
@@ -316,7 +316,7 @@ public:
     LOG_NAME_FUNC( Module );
 
     nframes_t nframes ( void ) const { return _nframes; }
-    void resize_buffers ( nframes_t v ) { _nframes = v; }
+    virtual void resize_buffers ( nframes_t v ) { _nframes = v; }
 
 
     int instances ( void ) const { return _instances; }
diff --git a/nonlib/dsp.C b/nonlib/dsp.C
index 037df51..5548425 100644
--- a/nonlib/dsp.C
+++ b/nonlib/dsp.C
@@ -21,47 +21,73 @@
 
 #include "dsp.h"
 #include "string.h" // for memset.
+#include <stdlib.h> 
 
-/* TODO: these functions are all targets for optimization (SSE?) */
+static const int ALIGNMENT = 16;
+
+sample_t *
+buffer_alloc ( nframes_t size )
+{
+    /* SIZE samples aligned to ALIGNMENT bytes, or NULL on failure; release with free(). */
+    void *p = NULL;
+    if ( posix_memalign( &p, ALIGNMENT, size * sizeof( sample_t ) ) != 0 )
+        return NULL; /* posix_memalign() leaves P unspecified on error */
+    return (sample_t*)p;
+}
 
 void
-buffer_apply_gain ( sample_t *buf, nframes_t nframes, float g )
+buffer_apply_gain ( sample_t * __restrict__ buf, nframes_t nframes, float g )
 {
+    sample_t * buf_ = (sample_t*) __builtin_assume_aligned(buf,ALIGNMENT);
+	
     if ( g != 1.0f )
         while ( nframes-- )
-            *(buf++) *= g;
+	  *(buf_++) *= g;
 }
 
 void
-buffer_apply_gain_buffer ( sample_t *buf, const sample_t *gainbuf, nframes_t nframes )
+buffer_apply_gain_buffer ( sample_t * __restrict__ buf, const sample_t * __restrict__ gainbuf, nframes_t nframes )
 {
+    sample_t * buf_ = (sample_t*) __builtin_assume_aligned(buf,ALIGNMENT);
+    const sample_t * gainbuf_ = (const sample_t*) __builtin_assume_aligned(gainbuf,ALIGNMENT);
+
     while ( nframes-- )
-        *(buf++) *= *(gainbuf++);
+        *(buf_++) *= *(gainbuf_++);
 }
 
 void
-buffer_copy_and_apply_gain_buffer ( sample_t *dst, const sample_t *src, const sample_t *gainbuf, nframes_t nframes )
+buffer_copy_and_apply_gain_buffer ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, const sample_t * __restrict__ gainbuf, nframes_t nframes )
 {
-    while ( nframes-- )
-        *(dst++) = *(src++) * *(gainbuf++);
+    sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT);
+    const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT);
+    const sample_t * gainbuf_ = (const sample_t*) __builtin_assume_aligned(gainbuf,ALIGNMENT);
+    
+	while ( nframes-- )
+        *(dst_++) = *(src_++) * *(gainbuf_++);
 }
 
 void
-buffer_mix ( sample_t *dst, const sample_t *src, nframes_t nframes )
+buffer_mix ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes )
 {
+    sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT);
+    const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT);
+
     while ( nframes-- )
-        *(dst++) += *(src++);
+        *(dst_++) += *(src_++);
 }
 
 void
-buffer_mix_with_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float g )
+buffer_mix_with_gain ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes, float g )
 {
+    sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT);
+    const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT);
+   
     while ( nframes-- )
-        *(dst++) += *(src++) * g;
+        *(dst_++) += *(src_++) * g;
 }
 
 void
-buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes )
+buffer_interleave_one_channel ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes )
 {
     dst += channel;
 
@@ -73,7 +99,7 @@ buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int channel,
 }
 
 void
-buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes )
+buffer_interleave_one_channel_and_mix ( sample_t *__restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes )
 {
     dst += channel;
 
@@ -85,7 +111,7 @@ buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int
 }
 
 void
-buffer_deinterleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes )
+buffer_deinterleave_one_channel ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes )
 {
     src += channel;
 
@@ -115,19 +141,36 @@ buffer_is_digital_black ( sample_t *buf, nframes_t nframes )
     return true;
 }
 
+/* Return the largest absolute sample in BUF (0.0f when NFRAMES is 0). BUF is assumed ALIGNMENT-byte aligned. */
+float
+buffer_get_peak ( const sample_t * __restrict__ buf, nframes_t nframes )
+{
+    const sample_t * buf_ = (const sample_t*) __builtin_assume_aligned(buf,ALIGNMENT);
+    float p = 0.0f;
+
+    while ( nframes-- )
+    {
+        /* fabsf() keeps the comparison in single precision (no detour through double). */
+        const float s = fabsf(*(buf_++));
+        p = s > p ? s : p;
+    }
+    return p;
+}
+
 void
-buffer_copy ( sample_t *dst, const sample_t *src, nframes_t nframes )
+buffer_copy ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes )
 {
     memcpy( dst, src, nframes * sizeof( sample_t ) );
 }
 
 void
-buffer_copy_and_apply_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float gain )
+buffer_copy_and_apply_gain ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes, float gain )
 {
     memcpy( dst, src, nframes * sizeof( sample_t ) );
     buffer_apply_gain( dst, nframes, gain );
 }
 
+
 void
 Value_Smoothing_Filter::sample_rate ( nframes_t n )
 {
@@ -138,8 +181,10 @@ Value_Smoothing_Filter::sample_rate ( nframes_t n )
 }
 
 bool
-Value_Smoothing_Filter::apply( sample_t *dst, nframes_t nframes, float gt )
+Value_Smoothing_Filter::apply( sample_t * __restrict__ dst, nframes_t nframes, float gt )
 {
+    sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT);
+    
     const float a = 0.07f;
     const float b = 1 + a;
     
@@ -155,7 +200,7 @@ Value_Smoothing_Filter::apply( sample_t *dst, nframes_t nframes, float gt )
     {
         g1 += w * (gm - g1 - a * g2);
         g2 += w * (g1 - g2);
-        dst[i] = g2;
+        dst_[i] = g2;
     }
 
     if ( fabsf( gt - g2 ) < 0.0001f )
diff --git a/nonlib/dsp.h b/nonlib/dsp.h
index 19ebc7f..bdb28c8 100644
--- a/nonlib/dsp.h
+++ b/nonlib/dsp.h
@@ -22,6 +22,8 @@
 #include "JACK/Client.H"
 #include <math.h>
 
+
+sample_t *buffer_alloc ( nframes_t size );
 void buffer_apply_gain ( sample_t *buf, nframes_t nframes, float g );
 void buffer_apply_gain_buffer ( sample_t *buf, const sample_t *gainbuf, nframes_t nframes );
 void buffer_copy_and_apply_gain_buffer ( sample_t *dst, const sample_t *src, const sample_t *gainbuf, nframes_t nframes );
@@ -31,7 +33,8 @@ void buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int cha
 void buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes );
 void buffer_deinterleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes );
 void buffer_fill_with_silence ( sample_t *buf, nframes_t nframes );
 bool buffer_is_digital_black ( sample_t *buf, nframes_t nframes );
+float buffer_get_peak ( const sample_t *buf, nframes_t nframes );
 void buffer_copy ( sample_t *dst, const sample_t *src, nframes_t nframes );
 void buffer_copy_and_apply_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float gain );
 
diff --git a/timeline/src/Engine/Playback_DS.C b/timeline/src/Engine/Playback_DS.C
index ab22f51..7a2435b 100644
--- a/timeline/src/Engine/Playback_DS.C
+++ b/timeline/src/Engine/Playback_DS.C
@@ -119,9 +119,9 @@ Playback_DS::disk_thread ( void )
     DMESSAGE( "playback thread running" );
 
     /* buffer to hold the interleaved data returned by the track reader */
-    sample_t *buf = new sample_t[ _nframes * channels() * _disk_io_blocks ];
+    sample_t *buf = buffer_alloc( _nframes * channels() * _disk_io_blocks );
 #ifndef AVOID_UNNECESSARY_COPYING
-    sample_t *cbuf = new sample_t[ _nframes * _disk_io_blocks ];
+    sample_t *cbuf = buffer_alloc( _nframes * _disk_io_blocks );
 #endif
 
     int blocks_ready = 0;
@@ -168,6 +168,7 @@ Playback_DS::disk_thread ( void )
         {
 
 #ifdef AVOID_UNNECESSARY_COPYING
+
             /* deinterleave direcectly into the ringbuffer to avoid
              * unnecessary copying */
 
@@ -217,9 +218,9 @@ done:
 
     DMESSAGE( "playback thread terminating" );
 
-    delete[] buf;
+    free(buf);
 #ifndef AVOID_UNNECESSARY_COPYING
-    delete[] cbuf;
+    free(cbuf);
 #endif
 
     _terminate = false;
diff --git a/timeline/src/Engine/Record_DS.C b/timeline/src/Engine/Record_DS.C
index 9c5d11b..f0aa9a7 100644
--- a/timeline/src/Engine/Record_DS.C
+++ b/timeline/src/Engine/Record_DS.C
@@ -77,9 +77,9 @@ Record_DS::disk_thread ( void )
     const nframes_t nframes = _nframes * _disk_io_blocks;
 
     /* buffer to hold the interleaved data returned by the track reader */
-    sample_t *buf = new sample_t[ nframes * channels() ];
+    sample_t *buf = buffer_alloc( nframes * channels() );
 #ifndef AVOID_UNNECESSARY_COPYING
-    sample_t *cbuf = new sample_t[ nframes ];
+    sample_t *cbuf = buffer_alloc( nframes );
 #endif
 
     const size_t block_size = nframes * sizeof( sample_t );
@@ -98,7 +98,6 @@ Record_DS::disk_thread ( void )
         {
 
 #ifdef AVOID_UNNECESSARY_COPYING
-
             /* interleave direcectly from the ringbuffer to avoid
              * unnecessary copying */
 
@@ -122,7 +121,6 @@ Record_DS::disk_thread ( void )
                 const nframes_t f = rbd[ 0 ].len / sizeof( sample_t );
 
                 /* do the first half */
-                buffer_deinterleave_one_channel( (sample_t*)rbd[ 0 ].buf, buf, i, channels(), f );
                 buffer_interleave_one_channel( buf, (sample_t*)rbd[ 0 ].buf, i, channels(), f );
 
                 assert( rbd[ 1 ].len >= ( nframes - f ) * sizeof( sample_t ) );
@@ -158,7 +156,7 @@ Record_DS::disk_thread ( void )
         const size_t block_size = _nframes * sizeof( sample_t );
 
 #ifdef AVOID_UNNECESSARY_COPYING
-        sample_t *cbuf = new sample_t[ nframes ];
+        sample_t *cbuf = buffer_alloc( nframes );
 #endif
 
         while ( blocks_ready-- > 0 || ( ! sem_trywait( &_blocks ) && errno != EAGAIN ) )
@@ -184,14 +182,14 @@ Record_DS::disk_thread ( void )
         }
 
 #ifdef AVOID_UNNECESSARY_COPYING
-        delete[] cbuf;
+        free(cbuf);
 #endif
 
     }
 
-    delete[] buf;
+    free(buf);
 #ifndef AVOID_UNNECESSARY_COPYING
-    delete[] cbuf;
+    free(cbuf);
 #endif
 
     DMESSAGE( "finalzing capture" );
diff --git a/wscript b/wscript
index 8ec772b..b8dd707 100644
--- a/wscript
+++ b/wscript
@@ -76,8 +76,8 @@ def configure(conf):
         print('Using SSE optimization')
         optimization_flags.extend( [
             "-msse2",
-            "-mfpmath=sse",
-            "-ftree-vectorize" ] )
+            "-mfpmath=sse" ] );
+
         conf.define( 'USE_SSE', 1 )
 
     debug_flags = [ '-O0', '-g3' ]