Use fastblur instead of C

2019-07-13 10:31:52 +08:00 · 2019-07-13 10:31:52 +08:00 · b4f8bae1cc
parent 8cf59d7d00
commit b4f8bae1cc
6 changed files with 344 additions and 360 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1189,7 +1189,6 @@ dependencies = [
 name = "silicon"
 version = "0.1.0"
 dependencies = [
- "cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)",
 "clipboard 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "conv 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -37,7 +37,6 @@ features = [ "termcolor", "atty", "humantime" ]

 [build-dependencies]
 syntect = "3.2"
-cc = "1.0.37"

 # [profile.release]
 # lto = true
--- a/build.rs
+++ b/build.rs
@ -27,11 +27,4 @@ fn create_syntax_dump() {
 fn main() {
    create_theme_dump();
    create_syntax_dump();
-
-    cc::Build::new()
-        .file("src/gauss/gauss.c")
-        .cpp(false)
-        .flag("-march=native")
-        .opt_level(2)
-        .compile("gauss");
 }
--- a/src/blur.rs
+++ b/src/blur.rs
@ -1,36 +1,352 @@
-use image::RgbaImage;
-use std::os::raw::c_uchar;
+//! Fast (linear time) implementation of the Gaussian Blur algorithm in Rust
+//!
+//! This file is originally from https://github.com/fschutt/fastblur
+//! Edited by aloxaf <aloxafx@gmail.com> to process RgbaImage

-#[link(name = "gauss")]
-#[link(name = "m")]
-extern "C" {
-    fn GaussianBlurFilter(
-        input: *const c_uchar,
-        output: *mut c_uchar,
-        width: i32,
-        height: i32,
-        stride: i32,
-        sigma: f32,
-    );
-}
+use image::RgbaImage;
+use std::cmp::min;

+/// Blurs an RGBA image and returns the result as an `RgbaImage` with the
+/// same dimensions as the input.
+///
+/// The image is consumed, its raw byte buffer is blurred by
+/// `gaussian_blur_impl` (which receives `sigma` as its blur radius), and the
+/// same buffer is wrapped back into an image.
 pub fn gaussian_blur(image: RgbaImage, sigma: f32) -> RgbaImage {
    let (width, height) = image.dimensions();
-    let stride = 4 * width;
-    let raw = image.as_flat_samples();
-    //let raw = image.into_raw();
-    let mut out = raw.samples.to_owned();
+    let mut raw = image.into_raw();
+    let len = raw.len();

    unsafe {
-        GaussianBlurFilter(
-            raw.samples.as_ptr(),
-            out.as_mut_ptr(),
-            width as i32,
-            height as i32,
-            stride as i32,
-            sigma,
-        );
+        // Temporarily shrink the length from a byte count to a pixel count
+        // (4 bytes per pixel) so the vector can be viewed as `[u8; 4]` items.
+        raw.set_len(len / 4);
+
+        // NOTE(review): this reinterprets `Vec<u8>` as `Vec<[u8; 4]>` in place.
+        // The capacity field still counts bytes, and the layouts of the two
+        // `Vec` types are not guaranteed to match, so the callee must never
+        // grow, shrink or replace the vector -- confirm this stays true.
+        let ptr = &mut *(&mut raw as *mut Vec<u8> as *mut Vec<[u8; 4]>);
+        gaussian_blur_impl(ptr, width as usize, height as usize, sigma);
+
+        // Restore the original byte length before the buffer is reused.
+        raw.set_len(len);
    }

-    RgbaImage::from_raw(width, height, out).unwrap()
+    RgbaImage::from_raw(width, height, raw).unwrap()
+}
+
+/// Approximates a Gaussian blur of `data` in place with three successive box blurs.
+///
+/// * `data` - four-channel pixels stored row by row; it is expected to hold
+///   `width * height` entries, because the box-blur helpers index it on that
+///   assumption.
+/// * `width`, `height` - image dimensions in pixels.
+/// * `blur_radius` - passed to `create_box_gauss` as the Gaussian sigma to
+///   derive the three box sizes.
+///
+/// The vector is only written through its elements; its length and allocation
+/// are never changed.
+pub fn gaussian_blur_impl(data: &mut Vec<[u8; 4]>, width: usize, height: usize, blur_radius: f32) {
+    let bxs = create_box_gauss(blur_radius, 3);
+    let mut backbuf = data.clone();
+
+    for &box_size in &bxs {
+        // A box of odd size `s` reaches `(s - 1) / 2` pixels to each side.
+        box_blur(
+            &mut backbuf,
+            data,
+            width,
+            height,
+            ((box_size - 1) / 2) as usize,
+        );
+    }
+
+    // `box_blur` reads its first buffer, writes the horizontal pass into the
+    // second and the finished (horizontal + vertical) pass back into the first.
+    // The complete result therefore lives in `backbuf`, while `data` only holds
+    // the horizontal half of the last pass. Copy the result back element-wise
+    // so the caller's allocation is left untouched.
+    data.copy_from_slice(&backbuf);
+}
+
+/// Computes the sizes of `n` box filters whose successive application
+/// approximates a Gaussian blur with standard deviation `sigma`.
+///
+/// Every returned size is one of two odd widths that differ by two; the
+/// first `m` entries use the smaller width and the rest the larger one,
+/// where `m` is derived from `sigma` and `n` below.
+#[inline]
+fn create_box_gauss(sigma: f32, n: usize) -> Vec<i32> {
+    let passes = n as f32;
+    let twelve_sigma_sq = 12.0 * sigma * sigma;
+
+    // Ideal averaging filter width, rounded down to the nearest odd integer
+    let ideal_width = (twelve_sigma_sq / passes).sqrt() + 1.0;
+    let floored = ideal_width.floor() as i32;
+    let lower = if floored % 2 == 0 { floored - 1 } else { floored };
+    let upper = lower + 2;
+
+    // Number of passes that should use the smaller box
+    let lower_f = lower as f32;
+    let ideal_lower_count = (twelve_sigma_sq
+        - passes * lower_f * lower_f
+        - 4.0 * passes * lower_f
+        - 3.0 * passes)
+        / (-4.0 * lower_f - 4.0);
+    let lower_count = ideal_lower_count.round() as usize;
+
+    (0..n)
+        .map(|pass| if pass < lower_count { lower } else { upper })
+        .collect()
+}
+
+/// Runs one full box-blur pass (horizontal, then vertical) with the given radius.
+///
+/// Both buffers must describe the same `width` x `height` image. `backbuf` is
+/// the input and also receives the finished pass; `frontbuf` is used as
+/// scratch space and is left holding the horizontally blurred intermediate.
+///
+/// The buffers are taken as slices because no resizing is needed; callers
+/// holding a `Vec` can still pass `&mut vec` unchanged.
+#[inline]
+fn box_blur(
+    backbuf: &mut [[u8; 4]],
+    frontbuf: &mut [[u8; 4]],
+    width: usize,
+    height: usize,
+    blur_radius: usize,
+) {
+    // backbuf -> frontbuf (rows), then frontbuf -> backbuf (columns)
+    box_blur_horz(backbuf, frontbuf, width, height, blur_radius);
+    box_blur_vert(frontbuf, backbuf, width, height, blur_radius);
+}
+
+/// Vertical box blur: every pixel of `frontbuf` becomes the average of the
+/// `2 * blur_radius + 1` pixels centred on it in the same column of `backbuf`.
+///
+/// Rows above the image count as copies of the column's first pixel and rows
+/// below it as copies of its last pixel. Each column keeps a running sum per
+/// channel that is updated by adding the pixel entering the window and
+/// subtracting the one leaving it.
+#[inline]
+fn box_blur_vert(
+    backbuf: &[[u8; 4]],
+    frontbuf: &mut [[u8; 4]],
+    width: usize,
+    height: usize,
+    blur_radius: usize,
+) {
+    let scale = 1.0 / (blur_radius + blur_radius + 1) as f32;
+
+    // Converts the running channel sums into an averaged output pixel.
+    let average = |sum: &[isize; 4]| -> [u8; 4] {
+        [
+            round(sum[0] as f32 * scale) as u8,
+            round(sum[1] as f32 * scale) as u8,
+            round(sum[2] as f32 * scale) as u8,
+            round(sum[3] as f32 * scale) as u8,
+        ]
+    };
+
+    for column in 0..width {
+        let top = column; // index of the column's first pixel (inclusive)
+        let bottom = column + width * (height - 1); // index of its last pixel (inclusive)
+        let mut target = column; // next pixel to write
+        let mut trailing = target; // pixel about to leave the window
+        let mut leading = target + blur_radius * width; // pixel about to enter it
+
+        let first = backbuf[top];
+        let last = backbuf[bottom];
+
+        // Start with the window hanging one row above the image: the rows
+        // outside the image contribute `blur_radius + 1` copies of `first`.
+        let edge_weight = blur_radius as isize + 1;
+        let mut sum = [
+            edge_weight * isize::from(first[0]),
+            edge_weight * isize::from(first[1]),
+            edge_weight * isize::from(first[2]),
+            edge_weight * isize::from(first[3]),
+        ];
+
+        for row in 0..min(blur_radius, height) {
+            let px = backbuf[target + row * width];
+            for c in 0..4 {
+                sum[c] += isize::from(px[c]);
+            }
+        }
+        if blur_radius > height {
+            // The window reaches past the bottom; pad with the last pixel.
+            let overhang = (blur_radius - height) as isize;
+            for c in 0..4 {
+                sum[c] += overhang * isize::from(last[c]);
+            }
+        }
+
+        // Top section: the pixel leaving the window is always a copy of `first`.
+        for _ in 0..min(height, blur_radius + 1) {
+            let entering = if leading > bottom { last } else { backbuf[leading] };
+            leading += width;
+            for c in 0..4 {
+                sum[c] += isize::from(entering[c]) - isize::from(first[c]);
+            }
+
+            frontbuf[target] = average(&sum);
+            target += width;
+        }
+
+        if height > blur_radius {
+            // otherwise `(height - blur_radius)` will underflow
+            // Middle section: both window ends lie inside the image.
+            for _ in (blur_radius + 1)..(height - blur_radius) {
+                let entering = backbuf[leading];
+                leading += width;
+                let leaving = backbuf[trailing];
+                trailing += width;
+
+                for c in 0..4 {
+                    sum[c] += isize::from(entering[c]) - isize::from(leaving[c]);
+                }
+
+                frontbuf[target] = average(&sum);
+                target += width;
+            }
+
+            // Bottom section: the pixel entering the window is always a copy of `last`.
+            for _ in 0..min(height - blur_radius - 1, blur_radius) {
+                let leaving = if trailing < top { first } else { backbuf[trailing] };
+                trailing += width;
+
+                for c in 0..4 {
+                    sum[c] += isize::from(last[c]) - isize::from(leaving[c]);
+                }
+
+                frontbuf[target] = average(&sum);
+                target += width;
+            }
+        }
+    }
+}
+
+/// Horizontal box blur: every pixel of `frontbuf` becomes the average of the
+/// `2 * blur_radius + 1` pixels centred on it in the same row of `backbuf`.
+///
+/// * `backbuf` - source pixels, stored row by row (`width` pixels per row).
+/// * `frontbuf` - destination; the pixels of all `height` rows are overwritten.
+/// * `blur_radius` - number of neighbours taken on each side of a pixel.
+///
+/// Columns left of the image count as copies of the row's first pixel and
+/// columns right of it as copies of its last pixel. Each row keeps a running
+/// sum per channel that is updated by adding the pixel entering the window
+/// and subtracting the one leaving it.
+#[inline]
+fn box_blur_horz(
+    backbuf: &[[u8; 4]],
+    frontbuf: &mut [[u8; 4]],
+    width: usize,
+    height: usize,
+    blur_radius: usize,
+) {
+    let iarr = 1.0 / (blur_radius + blur_radius + 1) as f32;
+
+    // Converts the running channel sums into an averaged output pixel.
+    let to_pixel = |sum: &[isize; 4]| -> [u8; 4] {
+        [
+            round(sum[0] as f32 * iarr) as u8,
+            round(sum[1] as f32 * iarr) as u8,
+            round(sum[2] as f32 * iarr) as u8,
+            round(sum[3] as f32 * iarr) as u8,
+        ]
+    };
+
+    for i in 0..height {
+        let row_start: usize = i * width; // inclusive
+        let row_end: usize = (i + 1) * width - 1; // inclusive
+        let mut ti: usize = row_start; // next pixel to write
+        let mut li: usize = row_start; // pixel about to leave the window
+        let mut ri: usize = row_start + blur_radius; // pixel about to enter it
+
+        let fv: [u8; 4] = backbuf[row_start];
+        let lv: [u8; 4] = backbuf[row_end];
+
+        // Start with the window hanging one column left of the image: the
+        // columns outside the image contribute `blur_radius + 1` copies of `fv`.
+        let edge_weight = blur_radius as isize + 1;
+        let mut sum = [
+            edge_weight * isize::from(fv[0]),
+            edge_weight * isize::from(fv[1]),
+            edge_weight * isize::from(fv[2]),
+            edge_weight * isize::from(fv[3]),
+        ];
+
+        for j in 0..min(blur_radius, width) {
+            let bb = backbuf[row_start + j];
+            for c in 0..4 {
+                sum[c] += isize::from(bb[c]);
+            }
+        }
+        if blur_radius > width {
+            // The window reaches past the right edge; pad with the last pixel.
+            // The overhang is measured against the row length (`width`), not
+            // the image height.
+            let overhang = (blur_radius - width) as isize;
+            for c in 0..4 {
+                sum[c] += overhang * isize::from(lv[c]);
+            }
+        }
+
+        // Left section: the pixel leaving the window is always a copy of `fv`.
+        for _ in 0..min(width, blur_radius + 1) {
+            let bb = if ri > row_end { lv } else { backbuf[ri] };
+            ri += 1;
+            // Every channel, alpha included, takes its own component.
+            for c in 0..4 {
+                sum[c] += isize::from(bb[c]) - isize::from(fv[c]);
+            }
+
+            frontbuf[ti] = to_pixel(&sum);
+            ti += 1;
+        }
+
+        if width > blur_radius {
+            // otherwise `(width - blur_radius)` will underflow
+            // Middle section: both window ends lie inside the row, so the
+            // buffer can be indexed directly without edge checks.
+            for _ in (blur_radius + 1)..(width - blur_radius) {
+                let bb1 = backbuf[ri];
+                ri += 1;
+                let bb2 = backbuf[li];
+                li += 1;
+
+                for c in 0..4 {
+                    sum[c] += isize::from(bb1[c]) - isize::from(bb2[c]);
+                }
+
+                frontbuf[ti] = to_pixel(&sum);
+                ti += 1;
+            }
+
+            // Right section: the pixel entering the window is always a copy
+            // of `lv`. `li` starts at `row_start` and only grows, so it never
+            // falls left of the row.
+            for _ in 0..min(width - blur_radius - 1, blur_radius) {
+                let bb = backbuf[li];
+                li += 1;
+
+                for c in 0..4 {
+                    sum[c] += isize::from(lv[c]) - isize::from(bb[c]);
+                }
+
+                frontbuf[ti] = to_pixel(&sum);
+                ti += 1;
+            }
+        }
+    }
+}
+
+/// Rounds `x` to the nearest integer by adding and then subtracting
+/// 1.5 * 2^23, a magnitude at which `f32` can no longer represent fractions.
+///
+/// Only meaningful for small magnitudes (the original note says x <= 2^23)
+/// and intended as a cheaper alternative to the built-in rounding intrinsic.
+/// Ties are resolved by the floating-point addition itself rather than
+/// always rounding away from zero.
+///
+/// Source: https://stackoverflow.com/a/42386149/585725
+#[inline]
+fn round(x: f32) -> f32 {
+    const SHIFT: f32 = 12582912.0;
+    (x + SHIFT) - SHIFT
 }
--- a/src/config.rs
+++ b/src/config.rs
@ -85,7 +85,7 @@ pub struct Config {
    pub shadow_color: Rgba<u8>,

    /// Blur radius of the shadow
-    #[structopt(long, value_name = "radius", default_value = "10.0")]
+    #[structopt(long, value_name = "radius", default_value = "70.0")]
    pub shadow_blur_radius: f32,

    /// Pad horiz
--- a/src/gauss/gauss.c
+++ b/src/gauss/gauss.c
@ -1,323 +0,0 @@
-// https://www.cnblogs.com/cpuimage/p/5291660.html
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#if defined _WIN32 || defined __CYGWIN__
-#ifdef BUILDING_DLL
-#ifdef __GNUC__
-#define DLL_PUBLIC __attribute__((dllexport))
-#else
-#define DLL_PUBLIC                                                             \
-  __declspec(                                                                  \
-      dllexport) // Note: actually gcc seems to also supports this syntax.
-#endif
-#else
-#ifdef __GNUC__
-#define DLL_PUBLIC __attribute__((dllimport))
-#else
-#define DLL_PUBLIC                                                             \
-  __declspec(                                                                  \
-      dllimport) // Note: actually gcc seems to also supports this syntax.
-#endif
-#endif
-#define DLL_LOCAL
-#else
-#if __GNUC__ >= 4
-#define DLL_PUBLIC __attribute__((visibility("default")))
-#define DLL_LOCAL __attribute__((visibility("hidden")))
-#else
-#define DLL_PUBLIC
-#define DLL_LOCAL
-#endif
-#endif
-
-void CalGaussianCoeff(float sigma, float *a0, float *a1, float *a2, float *a3,
-                      float *b1, float *b2, float *cprev, float *cnext) {
-  float alpha, lamma, k;
-
-  if (sigma < 0.5f)
-    sigma = 0.5f;
-  alpha = (float)exp((0.726) * (0.726)) / sigma;
-  lamma = (float)exp(-alpha);
-  *b2 = (float)exp(-2 * alpha);
-  k = (1 - lamma) * (1 - lamma) / (1 + 2 * alpha * lamma - (*b2));
-  *a0 = k;
-  *a1 = k * (alpha - 1) * lamma;
-  *a2 = k * (alpha + 1) * lamma;
-  *a3 = -k * (*b2);
-  *b1 = -2 * lamma;
-  *cprev = (*a0 + *a1) / (1 + *b1 + *b2);
-  *cnext = (*a2 + *a3) / (1 + *b1 + *b2);
-}
-
-void gaussianHorizontal(unsigned char *bufferPerLine,
-                        unsigned char *lpRowInitial, unsigned char *lpColumn,
-                        int width, int height, int Channels,
-                        float a0a1, float a2a3, float b1b2, float cprev,
-                        float cnext) {
-  int HeightStep = Channels * height;
-  int WidthSubOne = width - 1;
-  if (Channels == 3) {
-    float prevOut[3];
-    prevOut[0] = (lpRowInitial[0] * cprev);
-    prevOut[1] = (lpRowInitial[1] * cprev);
-    prevOut[2] = (lpRowInitial[2] * cprev);
-    for (int x = 0; x < width; ++x) {
-      prevOut[0] = ((lpRowInitial[0] * (a0a1)) - (prevOut[0] * (b1b2)));
-      prevOut[1] = ((lpRowInitial[1] * (a0a1)) - (prevOut[1] * (b1b2)));
-      prevOut[2] = ((lpRowInitial[2] * (a0a1)) - (prevOut[2] * (b1b2)));
-      bufferPerLine[0] = prevOut[0];
-      bufferPerLine[1] = prevOut[1];
-      bufferPerLine[2] = prevOut[2];
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    lpColumn += HeightStep * WidthSubOne;
-    bufferPerLine -= Channels;
-    prevOut[0] = (lpRowInitial[0] * cnext);
-    prevOut[1] = (lpRowInitial[1] * cnext);
-    prevOut[2] = (lpRowInitial[2] * cnext);
-
-    for (int x = WidthSubOne; x >= 0; --x) {
-      prevOut[0] = ((lpRowInitial[0] * (a2a3)) - (prevOut[0] * (b1b2)));
-      prevOut[1] = ((lpRowInitial[1] * (a2a3)) - (prevOut[1] * (b1b2)));
-      prevOut[2] = ((lpRowInitial[2] * (a2a3)) - (prevOut[2] * (b1b2)));
-      bufferPerLine[0] += prevOut[0];
-      bufferPerLine[1] += prevOut[1];
-      bufferPerLine[2] += prevOut[2];
-      lpColumn[0] = bufferPerLine[0];
-      lpColumn[1] = bufferPerLine[1];
-      lpColumn[2] = bufferPerLine[2];
-      lpRowInitial -= Channels;
-      lpColumn -= HeightStep;
-      bufferPerLine -= Channels;
-    }
-  } else if (Channels == 4) {
-    float prevOut[4];
-
-    prevOut[0] = (lpRowInitial[0] * cprev);
-    prevOut[1] = (lpRowInitial[1] * cprev);
-    prevOut[2] = (lpRowInitial[2] * cprev);
-    prevOut[3] = (lpRowInitial[3] * cprev);
-    for (int x = 0; x < width; ++x) {
-      prevOut[0] = ((lpRowInitial[0] * (a0a1)) - (prevOut[0] * (b1b2)));
-      prevOut[1] = ((lpRowInitial[1] * (a0a1)) - (prevOut[1] * (b1b2)));
-      prevOut[2] = ((lpRowInitial[2] * (a0a1)) - (prevOut[2] * (b1b2)));
-      prevOut[3] = ((lpRowInitial[3] * (a0a1)) - (prevOut[3] * (b1b2)));
-
-      bufferPerLine[0] = prevOut[0];
-      bufferPerLine[1] = prevOut[1];
-      bufferPerLine[2] = prevOut[2];
-      bufferPerLine[3] = prevOut[3];
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    lpColumn += HeightStep * WidthSubOne;
-    bufferPerLine -= Channels;
-
-    prevOut[0] = (lpRowInitial[0] * cnext);
-    prevOut[1] = (lpRowInitial[1] * cnext);
-    prevOut[2] = (lpRowInitial[2] * cnext);
-    prevOut[3] = (lpRowInitial[3] * cnext);
-
-    for (int x = WidthSubOne; x >= 0; --x) {
-      prevOut[0] = ((lpRowInitial[0] * a2a3) - (prevOut[0] * b1b2));
-      prevOut[1] = ((lpRowInitial[1] * a2a3) - (prevOut[1] * b1b2));
-      prevOut[2] = ((lpRowInitial[2] * a2a3) - (prevOut[2] * b1b2));
-      prevOut[3] = ((lpRowInitial[3] * a2a3) - (prevOut[3] * b1b2));
-      bufferPerLine[0] += prevOut[0];
-      bufferPerLine[1] += prevOut[1];
-      bufferPerLine[2] += prevOut[2];
-      bufferPerLine[3] += prevOut[3];
-      lpColumn[0] = bufferPerLine[0];
-      lpColumn[1] = bufferPerLine[1];
-      lpColumn[2] = bufferPerLine[2];
-      lpColumn[3] = bufferPerLine[3];
-      lpRowInitial -= Channels;
-      lpColumn -= HeightStep;
-      bufferPerLine -= Channels;
-    }
-  } else if (Channels == 1) {
-    float prevOut = (lpRowInitial[0] * cprev);
-
-    for (int x = 0; x < width; ++x) {
-      prevOut = ((lpRowInitial[0] * (a0a1)) - (prevOut * (b1b2)));
-      bufferPerLine[0] = prevOut;
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    lpColumn += HeightStep * WidthSubOne;
-    bufferPerLine -= Channels;
-
-    prevOut = (lpRowInitial[0] * cnext);
-
-    for (int x = WidthSubOne; x >= 0; --x) {
-      prevOut = ((lpRowInitial[0] * a2a3) - (prevOut * b1b2));
-      bufferPerLine[0] += prevOut;
-      lpColumn[0] = bufferPerLine[0];
-      lpRowInitial -= Channels;
-      lpColumn -= HeightStep;
-      bufferPerLine -= Channels;
-    }
-  }
-}
-
-void gaussianVertical(unsigned char *bufferPerLine, unsigned char *lpRowInitial,
-                      unsigned char *lpColInitial, int height, int width,
-                      int Channels, float a0a1, float a2a3, float b1b2,
-                      float cprev, float cnext) {
-
-  int WidthStep = Channels * width;
-  int HeightSubOne = height - 1;
-  if (Channels == 3) {
-    float prevOut[3];
-    prevOut[0] = (lpRowInitial[0] * cprev);
-    prevOut[1] = (lpRowInitial[1] * cprev);
-    prevOut[2] = (lpRowInitial[2] * cprev);
-
-    for (int y = 0; y < height; y++) {
-      prevOut[0] = ((lpRowInitial[0] * a0a1) - (prevOut[0] * b1b2));
-      prevOut[1] = ((lpRowInitial[1] * a0a1) - (prevOut[1] * b1b2));
-      prevOut[2] = ((lpRowInitial[2] * a0a1) - (prevOut[2] * b1b2));
-      bufferPerLine[0] = prevOut[0];
-      bufferPerLine[1] = prevOut[1];
-      bufferPerLine[2] = prevOut[2];
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    bufferPerLine -= Channels;
-    lpColInitial += WidthStep * HeightSubOne;
-    prevOut[0] = (lpRowInitial[0] * cnext);
-    prevOut[1] = (lpRowInitial[1] * cnext);
-    prevOut[2] = (lpRowInitial[2] * cnext);
-    for (int y = HeightSubOne; y >= 0; y--) {
-      prevOut[0] = ((lpRowInitial[0] * a2a3) - (prevOut[0] * b1b2));
-      prevOut[1] = ((lpRowInitial[1] * a2a3) - (prevOut[1] * b1b2));
-      prevOut[2] = ((lpRowInitial[2] * a2a3) - (prevOut[2] * b1b2));
-      bufferPerLine[0] += prevOut[0];
-      bufferPerLine[1] += prevOut[1];
-      bufferPerLine[2] += prevOut[2];
-      lpColInitial[0] = bufferPerLine[0];
-      lpColInitial[1] = bufferPerLine[1];
-      lpColInitial[2] = bufferPerLine[2];
-      lpRowInitial -= Channels;
-      lpColInitial -= WidthStep;
-      bufferPerLine -= Channels;
-    }
-  } else if (Channels == 4) {
-    float prevOut[4];
-
-    prevOut[0] = (lpRowInitial[0] * cprev);
-    prevOut[1] = (lpRowInitial[1] * cprev);
-    prevOut[2] = (lpRowInitial[2] * cprev);
-    prevOut[3] = (lpRowInitial[3] * cprev);
-
-    for (int y = 0; y < height; y++) {
-      prevOut[0] = ((lpRowInitial[0] * a0a1) - (prevOut[0] * b1b2));
-      prevOut[1] = ((lpRowInitial[1] * a0a1) - (prevOut[1] * b1b2));
-      prevOut[2] = ((lpRowInitial[2] * a0a1) - (prevOut[2] * b1b2));
-      prevOut[3] = ((lpRowInitial[3] * a0a1) - (prevOut[3] * b1b2));
-      bufferPerLine[0] = prevOut[0];
-      bufferPerLine[1] = prevOut[1];
-      bufferPerLine[2] = prevOut[2];
-      bufferPerLine[3] = prevOut[3];
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    bufferPerLine -= Channels;
-    lpColInitial += WidthStep * HeightSubOne;
-    prevOut[0] = (lpRowInitial[0] * cnext);
-    prevOut[1] = (lpRowInitial[1] * cnext);
-    prevOut[2] = (lpRowInitial[2] * cnext);
-    prevOut[3] = (lpRowInitial[3] * cnext);
-    for (int y = HeightSubOne; y >= 0; y--) {
-      prevOut[0] = ((lpRowInitial[0] * a2a3) - (prevOut[0] * b1b2));
-      prevOut[1] = ((lpRowInitial[1] * a2a3) - (prevOut[1] * b1b2));
-      prevOut[2] = ((lpRowInitial[2] * a2a3) - (prevOut[2] * b1b2));
-      prevOut[3] = ((lpRowInitial[3] * a2a3) - (prevOut[3] * b1b2));
-      bufferPerLine[0] += prevOut[0];
-      bufferPerLine[1] += prevOut[1];
-      bufferPerLine[2] += prevOut[2];
-      bufferPerLine[3] += prevOut[3];
-      lpColInitial[0] = bufferPerLine[0];
-      lpColInitial[1] = bufferPerLine[1];
-      lpColInitial[2] = bufferPerLine[2];
-      lpColInitial[3] = bufferPerLine[3];
-      lpRowInitial -= Channels;
-      lpColInitial -= WidthStep;
-      bufferPerLine -= Channels;
-    }
-  } else if (Channels == 1) {
-    float prevOut = 0;
-    prevOut = (lpRowInitial[0] * cprev);
-    for (int y = 0; y < height; y++) {
-      prevOut = ((lpRowInitial[0] * a0a1) - (prevOut * b1b2));
-      bufferPerLine[0] = prevOut;
-      bufferPerLine += Channels;
-      lpRowInitial += Channels;
-    }
-    lpRowInitial -= Channels;
-    bufferPerLine -= Channels;
-    lpColInitial += WidthStep * HeightSubOne;
-    prevOut = (lpRowInitial[0] * cnext);
-    for (int y = HeightSubOne; y >= 0; y--) {
-      prevOut = ((lpRowInitial[0] * a2a3) - (prevOut * b1b2));
-      bufferPerLine[0] += prevOut;
-      lpColInitial[0] = bufferPerLine[0];
-      lpRowInitial -= Channels;
-      lpColInitial -= WidthStep;
-      bufferPerLine -= Channels;
-    }
-  }
-}
-
-//本人博客:http://tntmonks.cnblogs.com/ 转载请注明出处.
-DLL_PUBLIC void GaussianBlurFilter(unsigned char *input,
-                                         unsigned char *output, int Width,
-                                         int Height, int Stride,
-                                         float GaussianSigma) {
-
-  int Channels = Stride / Width;
-  float a0, a1, a2, a3, b1, b2, cprev, cnext;
-
-  CalGaussianCoeff(GaussianSigma, &a0, &a1, &a2, &a3, &b1, &b2, &cprev, &cnext);
-
-  float a0a1 = (a0 + a1);
-  float a2a3 = (a2 + a3);
-  float b1b2 = (b1 + b2);
-
-  int bufferSizePerThread = (Width > Height ? Width : Height) * Channels;
-  unsigned char *bufferPerLine = (unsigned char *)malloc(bufferSizePerThread);
-  unsigned char *tempData = (unsigned char *)malloc(Height * Stride);
-  if (bufferPerLine == NULL || tempData == NULL) {
-    if (tempData) {
-      free(tempData);
-    }
-    if (bufferPerLine) {
-      free(bufferPerLine);
-    }
-    return;
-  }
-  for (int y = 0; y < Height; ++y) {
-    unsigned char *lpRowInitial = input + Stride * y;
-    unsigned char *lpColInitial = tempData + y * Channels;
-    gaussianHorizontal(bufferPerLine, lpRowInitial, lpColInitial, Width, Height,
-                       Channels, a0a1, a2a3, b1b2, cprev, cnext);
-  }
-  int HeightStep = Height * Channels;
-  for (int x = 0; x < Width; ++x) {
-    unsigned char *lpColInitial = output + x * Channels;
-    unsigned char *lpRowInitial = tempData + HeightStep * x;
-    gaussianVertical(bufferPerLine, lpRowInitial, lpColInitial, Height, Width,
-                     Channels, a0a1, a2a3, b1b2, cprev, cnext);
-  }
-
-  free(bufferPerLine);
-  free(tempData);
-}