Blog of roxlu, co-founder of Apollo Media. Contact info[shift+2]apollomedia.nl.

Fast OpenGL blur shader

Several optimized solutions have been proposed for applying a Gaussian blur to a texture with OpenGL. Normally, with a blur shader you fetch the color for a specific pixel, then fetch some pixels around this particular pixel and combine all values using some weight. This weight is based on the Gaussian function. How many pixels you fetch for each outgoing pixel determines the quality of the blur effect, but every extra fetch also adds cost and slows down your rendering.

Daniel Rákos described an interesting solution to reduce the number of texture fetches, which are costly operations compared to, let's say, multiplying some value in a shader. Reducing the number of texture fetches therefore speeds up the blur shader. You can read up on the details in Daniel's article. In short, he describes a technique where you make use of the linear sampling feature of OpenGL. Linear sampling is highly optimized and basically 'free' compared to the extra texture fetches one does without this optimized solution.

Instead of doing 9 texture fetches (per horizontal and vertical blur step) you only need to do 5 fetches for the same kernel size. The trick is that the color value you fetch is automatically interpolated by the hardware of your GPU; by combining this with adjusted weights you have to do fewer fetches. For this to work you need to change the weights and texture coordinates (offsets) a little bit so they return a similar, adjusted value.

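The image below shows the formula for adjusting the normal Gaussian weights and offsets for the optimized version that uses linear sampling: two neighbouring taps $t_1$ and $t_2$ with weights $w(t_1)$, $w(t_2)$ and offsets $o(t_1)$, $o(t_2)$ are merged into one fetch with weight $w_L = w(t_1) + w(t_2)$ and offset $o_L = \frac{o(t_1)\,w(t_1) + o(t_2)\,w(t_2)}{w_L}$. Credits for this formula go to Daniel.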

Blur performance and quality results

Another trivial way to speed up the blur process is to simply reduce the size of the texture that you want to blur. In the images below I've used several different sizes and blur steps. When you reduce the size of the input texture for the blur, you'll see that the quality is a lot lower, though this can easily be fixed by applying a couple more blur passes. You have to find the best mix between the scale and the number of blur passes you want to do.

Source code

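The code below can be used as an example of how to implement an optimized blur shader with a fixed kernel size and a user-defined sigma (the `amount` argument of `Blur::init()`, which the code passes to its Gaussian function as the squared sigma, `s2`).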

#ifndef GFX_BLUR_H
#define GFX_BLUR_H
 
#include <glad/glad.h>
 
#define ROXLU_USE_LOG
#define ROXLU_USE_OPENGL
#define ROXLU_USE_MATH
#include <tinylib.h>
 
/*
  Vertex shader source (GLSL 330) that is compiled once in Blur::init() and
  linked into both the horizontal and the vertical blur program.

  The shader declares no vertex attributes: positions and texture coordinates
  come from two constant 4-element arrays that are indexed with gl_VertexID.
  The positions are the corners of the -1..1 square, the texture coordinates
  the matching corners of the 0..1 square. Blur::blurX() / Blur::blurY() draw
  it with glDrawArrays(GL_TRIANGLE_STRIP, 0, 4).

  Only the "#version" line ends with a newline because a GLSL version
  directive has to sit on its own line; the remaining fragments are
  concatenated into one long line.
*/
static const char* BLUR_VS = ""
  "#version 330\n"
  ""
  " const vec2[] pos = vec2[4]("
  "   vec2(-1.0, 1.0),"
  "   vec2(-1.0, -1.0),"
  "   vec2(1.0, 1.0),"
  "   vec2(1.0, -1.0)"
  "   );"
  ""
  "const vec2 texcoords[4] = vec2[] ("
  "  vec2(0.0, 1.0), "
  "  vec2(0.0, 0.0), "
  "  vec2(1.0, 1.0), "
  "  vec2(1.0, 0.0)  "
  "); "
  ""
  "out vec2 v_tex;"
  ""
  "void main() {"
  "  gl_Position = vec4(pos[gl_VertexID], 0.0, 1.0);"
  "  v_tex = texcoords[gl_VertexID];"
  " }" 
  "";
 
 
namespace gfx { 
  /*
    Separable Gaussian blur: one shader program for the horizontal pass and
    one for the vertical pass, both generated at runtime by init().

    Both programs sample the `u_tex` sampler, which init() points at texture
    unit 0. The class itself never binds a texture or a render target.
    NOTE(review): presumably the caller binds the source texture to unit 0 and
    sets up the destination framebuffer before calling blurX() / blurY() -
    confirm against the calling code.
  */
  class Blur {
  public:
    Blur();                               /* Only resets the handles; no GL calls are made here. */
    ~Blur();                              /* Empty: the GL objects created by init() are not deleted. */
    int init(double amount);              /* Builds the shaders/programs and the vao. `amount` is handed to the Gaussian function as its `s2` argument. Returns 0. */
    void blurX(float w, float h);         /* Draws the horizontal pass; `w` and `h` are uploaded as u_tex_w / u_tex_h (the shader uses 1.0 / u_tex_w as the step between taps). */
    void blurY(float w, float h);         /* Draws the vertical pass; `w` and `h` are uploaded as u_tex_w / u_tex_h (the shader uses 1.0 / u_tex_h as the step between taps). */
 
  public:
    GLuint vao;                           /* Vertex array object generated in init(); bound for every draw call. */
    GLuint vert;                          /* Vertex shader created from BLUR_VS, shared by both programs. */
    GLuint frag_y;                        /* Fragment shader of the vertical pass. */
    GLuint frag_x;                        /* Fragment shader of the horizontal pass. */
    GLuint prog_x;                        /* Program of the horizontal pass (vert + frag_x). */
    GLuint prog_y;                        /* Program of the vertical pass (vert + frag_y). */
    GLint xtex_w;                         /* Location of `u_tex_w` in prog_x. */
    GLint xtex_h;                         /* Location of `u_tex_h` in prog_x. */
    GLint ytex_w;                         /* Location of `u_tex_w` in prog_y. */
    GLint ytex_h;                         /* Location of `u_tex_h` in prog_y. */
  };
} /* namespace gfx */
 
#endif

The implementation contains both the optimized and the non-optimized version. You can switch between them with the #if in the Blur::init() function.

#include <math.h>
#include <cmath>
#include <locale>
#include <sstream>
#include <string>
#include <vector>
#include <gfx/Blur.h>
 
namespace gfx {
 
  /* -------------------------------------------------------------------------------- */
 
  static float gauss(float x, float s2);
 
  /* -------------------------------------------------------------------------------- */
 
  /*
    Puts every handle into a known "not created yet" state without touching
    OpenGL.

    The initializers are listed in the order in which the members are declared
    in the class, because that is the order in which C++ runs them regardless
    of how they are written here. The uniform locations start at -1, which is
    the value glGetUniformLocation() uses for "no such uniform", so they no
    longer hold indeterminate values before init() is called.
  */
  Blur::Blur() 
    :vao(0)
    ,vert(0)
    ,frag_y(0)
    ,frag_x(0)
    ,prog_x(0)
    ,prog_y(0)
    ,xtex_w(-1)
    ,xtex_h(-1)
    ,ytex_w(-1)
    ,ytex_h(-1)
  {
  }
 
  /*
    Does nothing: the shaders, programs and vertex array object created by
    init() are not deleted here.
    NOTE(review): presumably the owner of the GL context is expected to clean
    these up (the handles are public members) - confirm; otherwise they stay
    alive until the context is destroyed.
  */
  Blur::~Blur() {
  }
 
  int Blur::init(double amount) {
    RX_VERBOSE("Creating blur shader - check the effect of having the first sum like: sum = weights[0] * 2.0, is better");
 
#if 1
    /* OPTIMIZED VERSION */
    float sum = 0.0;
    float weights[5] = { 0.0f } ;
    float offsets[5] = { 0.0, 1.0, 2.0, 3.0, 4.0 } ;
 
    /* Calculate the weights */
    weights[0] = gauss(0, amount);
    sum = weights[0];  //     sum = weights[0] * 2.0;
    for (int i = 1; i < 5; ++i) {
      weights[i] = gauss(i, amount);
      sum += 2.0 * weights[i];
    }
    for (int i = 0; i < 5; ++i) {
      weights[i] /= sum;
    }
 
    /* fix for just 3 fetches */
    float new_weights[3] = { weights[0], weights[1] + weights[2], weights[3] + weights[4] } ;
    float new_offsets[3] = { 0.0f };
    new_offsets[0] = 0.0f;
    new_offsets[1] = ( (weights[1] * offsets[1]) + (weights[2] * offsets[2]) ) / new_weights[1];
    new_offsets[2] = ( (weights[3] * offsets[3]) + (weights[4] * offsets[4]) ) / new_weights[2];
 
    /* create the shader */
    std::stringstream ss_open;
    ss_open << "#version 330\n"
            << "uniform sampler2D u_tex;\n"
            << "uniform float u_tex_w;\n"
            << "uniform float u_tex_h;\n"
            << "in vec2 v_tex;\n"
            << "layout( location = 0 ) out vec4 fragcolor;\n"
            << "\n"
            << "void main() {\n"
            << "  float sy = 1.0 / u_tex_h;\n"
            << "  float sx = 1.0 / u_tex_w;\n"
            << "";
 
    /* create the texture lookups */
    std::stringstream ss_y, ss_x;
    ss_y << "  fragcolor = texture(u_tex, v_tex) * " << new_weights[0] << ";\n";
    ss_x << "  fragcolor = texture(u_tex, v_tex) * " << new_weights[0] << ";\n";
 
    for (int i = 1; i < 3; ++i) {
      ss_y << "  fragcolor += texture(u_tex, vec2(v_tex.s, v_tex.y + (" << new_offsets[i] << " * sy))) * " << new_weights[i] << ";\n";
      ss_y << "  fragcolor += texture(u_tex, vec2(v_tex.s, v_tex.y - (" << new_offsets[i] << " * sy))) * " << new_weights[i] << ";\n";
      ss_x << "  fragcolor += texture(u_tex, vec2(v_tex.s + (" << new_offsets[i] << " * sx), v_tex.t)) * " << new_weights[i] << ";\n";
      ss_x << "  fragcolor += texture(u_tex, vec2(v_tex.s - (" << new_offsets[i] << " * sx), v_tex.t)) * " << new_weights[i] << ";\n";
    }
 
    ss_y << "}\n";
    ss_x << "}\n";
 
#else 
    /* UNOPTIMIZED */
    float sum = 0.0;
    float weights[5] = { 0.0f } ;
    float offsets[5] = { 0.0, 1.0, 2.0, 3.0, 4.0 } ;
 
    /* Calculate the weights */
    weights[0] = gauss(0, amount);
    sum = weights[0];  //     sum = weights[0] * 2.0;
    for (int i = 1; i < 5; ++i) {
      weights[i] = gauss(i, amount);
      sum += 2.0 * weights[i];
    }
    for (int i = 0; i < 5; ++i) {
      weights[i] /= sum;
    }
 
    /* create the shader */
    std::stringstream ss_open;
    ss_open << "#version 330\n"
            << "uniform sampler2D u_tex;\n"
            << "uniform float u_tex_w;\n"
            << "uniform float u_tex_h;\n"
            << "in vec2 v_tex;\n"
            << "layout( location = 0 ) out vec4 fragcolor;\n"
            << "\n"
            << "void main() {\n"
            << "  float sy = 1.0 / u_tex_h;\n"
            << "  float sx = 1.0 / u_tex_w;\n"
            << "";
 
 
    /* create the texture lookups */
    std::stringstream ss_y, ss_x;
    ss_y << "  fragcolor = texture(u_tex, v_tex) * " << weights[0] << ";\n";
    ss_x << "  fragcolor = texture(u_tex, v_tex) * " << weights[0] << ";\n";
 
    for (int i = 1; i < 5; ++i) {
      ss_y << "  fragcolor += texture(u_tex, vec2(v_tex.s, v_tex.y + (" << offsets[i] << ".0 * sy))) * " << weights[i] << ";\n";
      ss_y << "  fragcolor += texture(u_tex, vec2(v_tex.s, v_tex.y - (" << offsets[i] << ".0 * sy))) * " << weights[i] << ";\n";
      ss_x << "  fragcolor += texture(u_tex, vec2(v_tex.s + (" << offsets[i] << ".0 * sx), v_tex.t)) * " << weights[i] << ";\n";
      ss_x << "  fragcolor += texture(u_tex, vec2(v_tex.s - (" << offsets[i] << ".0 * sx), v_tex.t)) * " << weights[i] << ";\n";
    }
 
    ss_y << "}\n";
    ss_x << "}\n";
#endif
 
    std::string yfrag = ss_open.str() + ss_y.str();
    std::string xfrag = ss_open.str() + ss_x.str();
 
    /* create the shaders */
    vert = rx_create_shader(GL_VERTEX_SHADER, BLUR_VS);
    frag_x = rx_create_shader(GL_FRAGMENT_SHADER, xfrag.c_str());
    frag_y = rx_create_shader(GL_FRAGMENT_SHADER, yfrag.c_str());
    prog_x = rx_create_program(vert, frag_x, true);
    prog_y = rx_create_program(vert, frag_y, true);
 
    /* set the texture binding points */
    glUseProgram(prog_x);
    glUniform1i(glGetUniformLocation(prog_x, "u_tex"), 0);
    xtex_w = glGetUniformLocation(prog_x, "u_tex_w");
    xtex_h = glGetUniformLocation(prog_x, "u_tex_h");
 
    glUseProgram(prog_y);
    glUniform1i(glGetUniformLocation(prog_y, "u_tex"), 0);
    ytex_w = glGetUniformLocation(prog_y, "u_tex_w");
    ytex_h = glGetUniformLocation(prog_y, "u_tex_h");
 
    /* create our vao. */
    glGenVertexArrays(1, &vao);
 
    return 0;
  }
 
  void Blur::blurX(float w, float h) {
 
    /* make sure init has been called. */
    if (0 == prog_x || 0 == prog_y) {
      RX_ERROR("Shaders not initialized");
      return;
    }
 
    glBindVertexArray(vao);
    glUseProgram(prog_x);
    glUniform1f(xtex_w, w);
    glUniform1f(xtex_h, h);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
  }
 
  void Blur::blurY(float w, float h) {
 
    /* make sure init has been called. */
    if (0 == prog_x || 0 == prog_y) {
      RX_ERROR("Shaders not initialized");
      return;
    }
 
    glBindVertexArray(vao);
    glUseProgram(prog_y);
    glUniform1f(ytex_w, w);
    glUniform1f(ytex_h, h);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
  }
 
 
  /* -------------------------------------------------------------------------------- */
 
  /*
    Evaluates exp(-x^2 / (2 * s2)) / (2 * pi * s2).

    @param x   Distance from the center of the kernel.
    @param s2  Spread of the curve, used as-is in both denominators
               (i.e. it takes the place of the squared sigma).
    @return    The value of the curve at `x`, narrowed to float. The caller
               normalizes the weights afterwards, so only the ratio between
               the returned values matters for the blur.
  */
  static float gauss(float x, float s2) {
    const double two_pi = 2.0 * 3.14159265359;
    const float neg_x_squared = -(x * x);              /* squared in float, widened to double below. */
    const double exponent = neg_x_squared / (2.0 * s2);
    const double scale = 1.0 / (two_pi * s2);
    return (float) (scale * exp(exponent));
  }
 
} /* namespace gfx */