From 19487a72e4b470bed95e788e1b2198db641d8671 Mon Sep 17 00:00:00 2001 From: Jonathan Moore Liles Date: Tue, 13 Aug 2013 22:48:42 -0700 Subject: [PATCH] Tweak buffer ops for GCC auto vectorization. --- mixer/src/Chain.C | 8 +-- mixer/src/Meter_Module.C | 20 +------- mixer/src/Module.H | 2 +- nonlib/dsp.C | 83 ++++++++++++++++++++++++------- nonlib/dsp.h | 5 +- timeline/src/Engine/Playback_DS.C | 9 ++-- timeline/src/Engine/Record_DS.C | 14 +++--- wscript | 4 +- 8 files changed, 88 insertions(+), 57 deletions(-) diff --git a/mixer/src/Chain.C b/mixer/src/Chain.C index 52b1123..f461d93 100644 --- a/mixer/src/Chain.C +++ b/mixer/src/Chain.C @@ -176,7 +176,7 @@ Chain::~Chain ( ) client()->lock(); for ( unsigned int i = scratch_port.size(); i--; ) - delete[] (sample_t*)scratch_port[i].buffer(); + free( (sample_t*)scratch_port[i].buffer() ); /* if we leave this up to FLTK, it will happen after we've already destroyed the client */ @@ -376,13 +376,13 @@ Chain::configure_ports ( void ) if ( scratch_port.size() < req_buffers ) { for ( unsigned int i = scratch_port.size(); i--; ) - delete[] (sample_t*)scratch_port[i].buffer(); + free(scratch_port[i].buffer()); scratch_port.clear(); for ( unsigned int i = 0; i < req_buffers; ++i ) { Module::Port p( NULL, Module::Port::OUTPUT, Module::Port::AUDIO ); - p.connect_to( new sample_t[client()->nframes()] ); + p.connect_to( buffer_alloc( client()->nframes() ) ); buffer_fill_with_silence( (sample_t*)p.buffer(), client()->nframes() ); scratch_port.push_back( p ); } @@ -812,7 +812,7 @@ void Chain::buffer_size ( nframes_t nframes ) { for ( unsigned int i = scratch_port.size(); i--; ) - delete[] (sample_t*)scratch_port[i].buffer(); + free(scratch_port[i].buffer()); scratch_port.clear(); configure_ports(); diff --git a/mixer/src/Meter_Module.C b/mixer/src/Meter_Module.C index 6984c91..64aebe1 100644 --- a/mixer/src/Meter_Module.C +++ b/mixer/src/Meter_Module.C @@ -29,6 +29,7 @@ #include "Meter_Module.H" #include "DPM.H" #include "JACK/Port.H" +#include "dsp.h" @@ -170,23 +171,6 @@ Meter_Module::handle ( int m ) /* Engine */ /**********/ -static float -get_peak_sample ( const sample_t* buf, nframes_t nframes ) -{ - float p = 0.0f; - - const sample_t *f = buf; - - for ( int j = nframes; j--; ++f ) - { - const float s = fabs( *f ); - - if ( s > p ) - p = s; - } - - return p; -} void Meter_Module::process ( nframes_t nframes ) @@ -196,7 +180,7 @@ Meter_Module::process ( nframes_t nframes ) if ( audio_input[i].connected() ) { // float dB = 20 * log10( get_peak_sample( (float*)audio_input[i].buffer(), nframes ) / 2.0f ); - float dB = 20 * log10( get_peak_sample( (float*)audio_input[i].buffer(), nframes ) ); + float dB = 20 * log10( buffer_get_peak( (sample_t*) audio_input[i].buffer(), nframes ) ); ((float*)control_output[0].buffer())[i] = dB; if (dB > control_value[i]) diff --git a/mixer/src/Module.H b/mixer/src/Module.H index f71850a..716cc0f 100644 --- a/mixer/src/Module.H +++ b/mixer/src/Module.H @@ -316,7 +316,7 @@ public: LOG_NAME_FUNC( Module ); nframes_t nframes ( void ) const { return _nframes; } - void resize_buffers ( nframes_t v ) { _nframes = v; } + virtual void resize_buffers ( nframes_t v ) { _nframes = v; } int instances ( void ) const { return _instances; } diff --git a/nonlib/dsp.C b/nonlib/dsp.C index 037df51..5548425 100644 --- a/nonlib/dsp.C +++ b/nonlib/dsp.C @@ -21,47 +21,73 @@ #include "dsp.h" #include "string.h" // for memset. +#include -/* TODO: these functions are all targets for optimization (SSE?) */ +static const int ALIGNMENT = 16; + +sample_t * +buffer_alloc ( nframes_t size ) +{ + void *p; + + posix_memalign( &p, ALIGNMENT, size * sizeof( sample_t ) ); + + return (sample_t*)p; +} void -buffer_apply_gain ( sample_t *buf, nframes_t nframes, float g ) +buffer_apply_gain ( sample_t * __restrict__ buf, nframes_t nframes, float g ) { + sample_t * buf_ = (sample_t*) __builtin_assume_aligned(buf,ALIGNMENT); + if ( g != 1.0f ) while ( nframes-- ) - *(buf++) *= g; + *(buf_++) *= g; } void -buffer_apply_gain_buffer ( sample_t *buf, const sample_t *gainbuf, nframes_t nframes ) +buffer_apply_gain_buffer ( sample_t * __restrict__ buf, const sample_t * __restrict__ gainbuf, nframes_t nframes ) { + sample_t * buf_ = (sample_t*) __builtin_assume_aligned(buf,ALIGNMENT); + const sample_t * gainbuf_ = (const sample_t*) __builtin_assume_aligned(gainbuf,ALIGNMENT); + while ( nframes-- ) - *(buf++) *= *(gainbuf++); + *(buf_++) *= *(gainbuf_++); } void -buffer_copy_and_apply_gain_buffer ( sample_t *dst, const sample_t *src, const sample_t *gainbuf, nframes_t nframes ) +buffer_copy_and_apply_gain_buffer ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, const sample_t * __restrict__ gainbuf, nframes_t nframes ) { - while ( nframes-- ) - *(dst++) = *(src++) * *(gainbuf++); + sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT); + const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT); + const sample_t * gainbuf_ = (const sample_t*) __builtin_assume_aligned(gainbuf,ALIGNMENT); + + while ( nframes-- ) + *(dst_++) = *(src_++) * *(gainbuf_++); } void -buffer_mix ( sample_t *dst, const sample_t *src, nframes_t nframes ) +buffer_mix ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes ) { + sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT); + const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT); + while ( nframes-- ) - *(dst++) += *(src++); + *(dst_++) += *(src_++); } void -buffer_mix_with_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float g ) +buffer_mix_with_gain ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes, float g ) { + sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT); + const sample_t * src_ = (const sample_t*) __builtin_assume_aligned(src,ALIGNMENT); + while ( nframes-- ) - *(dst++) += *(src++) * g; + *(dst_++) += *(src_++) * g; } void -buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes ) +buffer_interleave_one_channel ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes ) { dst += channel; @@ -73,7 +99,7 @@ buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int channel, } void -buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes ) +buffer_interleave_one_channel_and_mix ( sample_t *__restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes ) { dst += channel; @@ -85,7 +111,7 @@ buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int } void -buffer_deinterleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes ) +buffer_deinterleave_one_channel ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, int channel, int channels, nframes_t nframes ) { src += channel; @@ -115,19 +141,36 @@ buffer_is_digital_black ( sample_t *buf, nframes_t nframes ) return true; } +float +buffer_get_peak ( const sample_t * __restrict__ buf, nframes_t nframes ) +{ + const sample_t * buf_ = (const sample_t*) __builtin_assume_aligned(buf,ALIGNMENT); + + float p = 0.0f; + + while ( nframes-- ) + { + const float s = fabs(*(buf_++)); + p = s > p ? s : p; + } + + return p; +} + void -buffer_copy ( sample_t *dst, const sample_t *src, nframes_t nframes ) +buffer_copy ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes ) { memcpy( dst, src, nframes * sizeof( sample_t ) ); } void -buffer_copy_and_apply_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float gain ) +buffer_copy_and_apply_gain ( sample_t * __restrict__ dst, const sample_t * __restrict__ src, nframes_t nframes, float gain ) { memcpy( dst, src, nframes * sizeof( sample_t ) ); buffer_apply_gain( dst, nframes, gain ); } + void Value_Smoothing_Filter::sample_rate ( nframes_t n ) { @@ -138,8 +181,10 @@ Value_Smoothing_Filter::sample_rate ( nframes_t n ) } bool -Value_Smoothing_Filter::apply( sample_t *dst, nframes_t nframes, float gt ) +Value_Smoothing_Filter::apply( sample_t * __restrict__ dst, nframes_t nframes, float gt ) { + sample_t * dst_ = (sample_t*) __builtin_assume_aligned(dst,ALIGNMENT); + const float a = 0.07f; const float b = 1 + a; @@ -155,7 +200,7 @@ Value_Smoothing_Filter::apply( sample_t *dst, nframes_t nframes, float gt ) { g1 += w * (gm - g1 - a * g2); g2 += w * (g1 - g2); - dst[i] = g2; + dst_[i] = g2; } if ( fabsf( gt - g2 ) < 0.0001f ) diff --git a/nonlib/dsp.h b/nonlib/dsp.h index 19ebc7f..bdb28c8 100644 --- a/nonlib/dsp.h +++ b/nonlib/dsp.h @@ -22,6 +22,8 @@ #include "JACK/Client.H" #include + +sample_t *buffer_alloc ( nframes_t size ); void buffer_apply_gain ( sample_t *buf, nframes_t nframes, float g ); void buffer_apply_gain_buffer ( sample_t *buf, const sample_t *gainbuf, nframes_t nframes ); void buffer_copy_and_apply_gain_buffer ( sample_t *dst, const sample_t *src, const sample_t *gainbuf, nframes_t nframes ); @@ -31,7 +33,8 @@ void buffer_interleave_one_channel ( sample_t *dst, const sample_t *src, int cha void buffer_interleave_one_channel_and_mix ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes ); void buffer_deinterleave_one_channel ( sample_t *dst, const sample_t *src, int channel, int channels, nframes_t nframes ); void buffer_fill_with_silence ( sample_t *buf, nframes_t nframes ); -bool buffer_is_digital_black ( sample_t *buf, nframes_t nframes ); +bool buffer_is_digital_black ( const sample_t *buf, nframes_t nframes ); +float buffer_get_peak ( const sample_t *buf, nframes_t nframes ); void buffer_copy ( sample_t *dst, const sample_t *src, nframes_t nframes ); void buffer_copy_and_apply_gain ( sample_t *dst, const sample_t *src, nframes_t nframes, float gain ); diff --git a/timeline/src/Engine/Playback_DS.C b/timeline/src/Engine/Playback_DS.C index ab22f51..7a2435b 100644 --- a/timeline/src/Engine/Playback_DS.C +++ b/timeline/src/Engine/Playback_DS.C @@ -119,9 +119,9 @@ Playback_DS::disk_thread ( void ) DMESSAGE( "playback thread running" ); /* buffer to hold the interleaved data returned by the track reader */ - sample_t *buf = new sample_t[ _nframes * channels() * _disk_io_blocks ]; + sample_t *buf = buffer_alloc( _nframes * channels() * _disk_io_blocks ); #ifndef AVOID_UNNECESSARY_COPYING - sample_t *cbuf = new sample_t[ _nframes * _disk_io_blocks ]; + sample_t *cbuf = buffer_alloc( _nframes * _disk_io_blocks ); #endif int blocks_ready = 0; @@ -168,6 +168,7 @@ Playback_DS::disk_thread ( void ) { #ifdef AVOID_UNNECESSARY_COPYING + /* deinterleave direcectly into the ringbuffer to avoid * unnecessary copying */ @@ -217,9 +218,9 @@ done: DMESSAGE( "playback thread terminating" ); - delete[] buf; + free(buf); #ifndef AVOID_UNNECESSARY_COPYING - delete[] cbuf; + free(cbuf); #endif _terminate = false; diff --git a/timeline/src/Engine/Record_DS.C b/timeline/src/Engine/Record_DS.C index 9c5d11b..f0aa9a7 100644 --- a/timeline/src/Engine/Record_DS.C +++ b/timeline/src/Engine/Record_DS.C @@ -77,9 +77,9 @@ Record_DS::disk_thread ( void ) const nframes_t nframes = _nframes * _disk_io_blocks; /* buffer to hold the interleaved data returned by the track reader */ - sample_t *buf = new sample_t[ nframes * channels() ]; + sample_t *buf = buffer_alloc( nframes * channels() ); #ifndef AVOID_UNNECESSARY_COPYING - sample_t *cbuf = new sample_t[ nframes ]; + sample_t *cbuf = buffer_alloc( nframes ); #endif const size_t block_size = nframes * sizeof( sample_t ); @@ -98,7 +98,6 @@ Record_DS::disk_thread ( void ) { #ifdef AVOID_UNNECESSARY_COPYING - /* interleave direcectly from the ringbuffer to avoid * unnecessary copying */ @@ -122,7 +121,6 @@ Record_DS::disk_thread ( void ) const nframes_t f = rbd[ 0 ].len / sizeof( sample_t ); /* do the first half */ - buffer_deinterleave_one_channel( (sample_t*)rbd[ 0 ].buf, buf, i, channels(), f ); buffer_interleave_one_channel( buf, (sample_t*)rbd[ 0 ].buf, i, channels(), f ); assert( rbd[ 1 ].len >= ( nframes - f ) * sizeof( sample_t ) ); @@ -158,7 +156,7 @@ Record_DS::disk_thread ( void ) const size_t block_size = _nframes * sizeof( sample_t ); #ifdef AVOID_UNNECESSARY_COPYING - sample_t *cbuf = new sample_t[ nframes ]; + sample_t *cbuf = buffer_alloc( nframes ); #endif while ( blocks_ready-- > 0 || ( ! sem_trywait( &_blocks ) && errno != EAGAIN ) ) @@ -184,14 +182,14 @@ Record_DS::disk_thread ( void ) } #ifdef AVOID_UNNECESSARY_COPYING - delete[] cbuf; + free(cbuf); #endif } - delete[] buf; + free(buf); #ifndef AVOID_UNNECESSARY_COPYING - delete[] cbuf; + free(cbuf); #endif DMESSAGE( "finalzing capture" ); diff --git a/wscript b/wscript index 8ec772b..b8dd707 100644 --- a/wscript +++ b/wscript @@ -76,8 +76,8 @@ def configure(conf): print('Using SSE optimization') optimization_flags.extend( [ "-msse2", - "-mfpmath=sse", - "-ftree-vectorize" ] ) + "-mfpmath=sse" ] ); + conf.define( 'USE_SSE', 1 ) debug_flags = [ '-O0', '-g3' ]