Files
mpv-config/shaders/other/nlmeans_luma.glsl
T
2026-03-27 07:06:16 +01:00

1036 lines
28 KiB
GLSL

/* vi: ft=c
*
* Based on vf_nlmeans.c from FFmpeg.
*
* Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
* Copyright (c) 2016 Clément Bœsch <u pkh me>
*
* This program is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Profile description: Luma only, saves some time by not processing the chroma plane.
/* The recommended usage of this shader and its variant profiles is to add them
* to input.conf and then dispatch the appropriate shader via a keybind during
* media playback. Here is an example input.conf entry:
*
* F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
*
* These shaders can also be enabled by default in mpv.conf, for example:
*
* glsl-shaders='~~/shaders/nlmeans.glsl'
*
* Both of the examples above assume the shaders are located in a subdirectory
* named "shaders" within mpv's config directory. Refer to the mpv
* documentation for more details.
*
* This shader is highly configurable via user variables below. Although the
* default settings should offer good quality at a reasonable speed, you are
* encouraged to tweak them to your preferences. Be mindful that certain
* settings may greatly affect speed.
*
* Denoising is most useful for noisy content. If there is no perceptible
* noise, you probably won't see a positive difference.
*
* The default settings are generally tuned for low noise and high detail
* preservation. The "medium" and "heavy" profiles are tuned for higher levels
* of noise.
*
* The denoiser will not work properly if the content has been upscaled
* beforehand, whether it was done by you or someone down the line. Consider
* issuing a command to downscale in the mpv console, like so:
*
* vf toggle scale=-2:720
*
* ...replacing 720 with whatever resolution seems appropriate. Rerun the
* command to undo the downscale. It may take some trial-and-error to find the
* proper resolution.
*/
/* Regarding speed
*
* Speed may vary wildly for different vo and gpu-api settings. Generally
* vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this
* may be different for your system.
*
* If your GPU doesn't support textureGather, or if you are on a version of mpv
* prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ
* profiles.
*
* textureGather is LUMA only and limited to the following configurations:
*
* - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1
* - Default, very fast, rotations and reflections should be free
* - If this is unusually slow then try changing gpu-api and vo
* - If it's still slow, try setting RI/RFI to 0.
*
* - PS=6:RI={0,1,3}:RFI={0,1,2}
* - Currently the only scalable variant
* - Patch shape is asymmetric on two axis
* - Rotations should have very little speed impact
* - Reflections may have a significant speed impact
*
* Options which always disable textureGather:
* - RF
* - PD
*/
// The following is shader code injected from guided_s.glsl
/* vi: ft=c
*
* Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
*
* This program is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
//desc: "Self-guided" guided filter
/* The radius can be adjusted with the MEANIP stage's downscaling factor.
* Higher numbers give a bigger radius.
*
* The E variable can be found in the A stage.
*
* The subsampling (fast guided filter) can be adjusted with the IP stage's
* downscaling factor. Higher numbers are faster.
*/
//!HOOK LUMA
//!DESC Guided filter (IP)
//!BIND HOOKED
//!WIDTH HOOKED.w 1.0 /
//!HEIGHT HOOKED.h 1.0 /
//!SAVE _INJ_IP
vec4 hook()
{
return HOOKED_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (MEANIP)
//!BIND _INJ_IP
//!WIDTH _INJ_IP.w 2.0 /
//!HEIGHT _INJ_IP.h 2.0 /
//!SAVE _INJ_MEANIP
vec4 hook()
{
return _INJ_IP_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (_INJ_IP_SQ)
//!BIND _INJ_IP
//!WIDTH _INJ_IP.w
//!HEIGHT _INJ_IP.h
//!SAVE _INJ_IP_SQ
vec4 hook()
{
return _INJ_IP_texOff(0) * _INJ_IP_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (CORRIP)
//!BIND _INJ_IP_SQ
//!WIDTH _INJ_MEANIP.w
//!HEIGHT _INJ_MEANIP.h
//!SAVE _INJ_CORRIP
vec4 hook()
{
return _INJ_IP_SQ_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (A)
//!BIND _INJ_MEANIP
//!BIND _INJ_CORRIP
//!WIDTH _INJ_IP.w
//!HEIGHT _INJ_IP.h
//!SAVE _INJ_A
#define E 0.001
vec4 hook()
{
vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0);
vec4 cov = var;
return cov / (var + E);
}
//!HOOK LUMA
//!DESC Guided filter (B)
//!BIND _INJ_A
//!BIND _INJ_MEANIP
//!WIDTH _INJ_IP.w
//!HEIGHT _INJ_IP.h
//!SAVE _INJ_B
vec4 hook()
{
return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (MEANA)
//!BIND _INJ_A
//!WIDTH _INJ_MEANIP.w
//!HEIGHT _INJ_MEANIP.h
//!SAVE _INJ_MEANA
vec4 hook()
{
return _INJ_A_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter (MEANB)
//!BIND _INJ_B
//!WIDTH _INJ_MEANIP.w
//!HEIGHT _INJ_MEANIP.h
//!SAVE _INJ_MEANB
vec4 hook()
{
return _INJ_B_texOff(0);
}
//!HOOK LUMA
//!DESC Guided filter
//!BIND HOOKED
//!BIND _INJ_MEANA
//!BIND _INJ_MEANB
//!SAVE RF_LUMA
vec4 hook()
{
return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
}
// End of source code injected from guided_s.glsl
//!HOOK LUMA
//!DESC Non-local means (downscale)
//!WIDTH LUMA.w 3 /
//!HEIGHT LUMA.h 3 /
//!BIND LUMA
//!SAVE EP
vec4 hook()
{
return LUMA_texOff(0);
}
//!HOOK LUMA
//!DESC Non-local means (share)
//!BIND RF_LUMA
//!SAVE RF
vec4 hook()
{
return RF_LUMA_texOff(0);
}
//!HOOK LUMA
//!BIND HOOKED
//!BIND RF_LUMA
//!BIND EP
//!BIND RF
//!DESC Non-local means (nlmeans_luma.glsl)
/* User variables
*
* It is usually preferable to denoise chroma and luma differently, so the user
* variables for luma and chroma are split.
*/
/* S = denoising factor
* P = patch size
* R = research size
*
* The denoising factor controls the level of blur, higher is blurrier.
*
* Patch size should usually be an odd number greater than or equal to 3.
* Higher values are slower and not always better.
*
* Research size usually be an odd number greater than or equal to 3. Higher
* values are usually better, but slower and offer diminishing returns.
*
* Even-numbered patch/research sizes will sample between pixels unless PS=6.
* It's not known whether this is ever useful behavior or not. This is
* incompatible with textureGather optimizations, so enable RF when using even
* patch/research sizes.
*/
#ifdef LUMA_raw
#define S 2.25
#define P 3
#define R 5
#else
#define S 1.50
#define P 3
#define R 5
#endif
/* Adaptive sharpening
*
* Uses the blur incurred by denoising plus the weight map to perform an
* unsharp mask that gets applied most strongly to edges.
*
* Sharpening will amplify noise, so the denoising factor (S) should usually be
* increased to compensate.
*
* AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable
* ASF: Sharpening factor, higher numbers make a sharper underlying image
* ASP: Weight power, higher numbers use more of the sharp image
*/
#ifdef LUMA_raw
#define AS 0
#define ASF 1.0
#define ASP 2.0
#else
#define AS 0
#define ASF 1.0
#define ASP 2.0
#endif
/* Starting weight
*
* Lower numbers give less weight to the pixel-of-interest, which may help
* handle higher noise levels, ringing, and may be useful for other things too?
*
* EPSILON should be used instead of zero to avoid divide-by-zero errors. The
* avg_weight variable may be used to make SW adapt to the local noise level,
* e.g., SW=max(avg_weight, EPSILON)
*/
#ifdef LUMA_raw
#define SW 1.0
#else
#define SW 1.0
#endif
/* Weight discard
*
* Discard weights that fall below a fraction of the average weight. This culls
* the most dissimilar samples from the blur, yielding a much more pleasant
* result, especially around edges.
*
* WD:
* - 2: True average. Very good quality, but slower and uses more memory.
* - 1: Moving cumulative average. Inaccurate, tends to blur directionally.
* - 0: Disable
*
* WDT: Threshold coefficient, higher numbers discard more
* WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes
*/
#ifdef LUMA_raw
#define WD 2
#define WDT 1.0
#define WDP 6.0
#else
#define WD 2
#define WDT 1.0
#define WDP 6.0
#endif
/* Search shape
*
* Determines the shape of patches and research zones. Different shapes have
* different speed and quality characteristics. Every shape (besides square) is
* smaller than square.
*
* PS applies applies to patches, RS applies to research zones.
*
* 0: square (symmetrical)
* 1: horizontal line (asymmetric)
* 2: vertical line (asymmetric)
* 3: diamond (symmetrical)
* 4: triangle (asymmetric, pointing upward)
* 5: truncated triangle (asymmetric on two axis, last row halved)
* 6: even sized square (asymmetric on two axis)
* 7: plus (symmetrical)
*/
#ifdef LUMA_raw
#define RS 3
#define PS 3
#else
#define RS 3
#define PS 3
#endif
/* Rotational/reflectional invariance
*
* Number of rotations/reflections to try for each patch comparison. Slow, but
* improves feature preservation, although adding more rotations/reflections
* gives diminishing returns. The most similar rotation/reflection will be used.
*
* The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a
* single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
*
* RI: Rotational invariance
* RFI (0 to 2): Reflectional invariance
*/
#ifdef LUMA_raw
#define RI 3
#define RFI 2
#else
#define RI 0
#define RFI 0
#endif
/* Temporal denoising
*
* Caveats:
* - Slower, each frame needs to be researched
* - Requires vo=gpu-next and nlmeans_temporal.glsl
* - Luma-only (this is a bug)
* - Buggy
*
* Gather samples across multiple frames. May cause motion blur and may
* struggle more with noise that persists across multiple frames (compression
* noise, repeating frames), but can work very well on high quality video.
*
* Motion estimation (ME) should improve quality without impacting speed.
*
* T: number of frames used
* ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
*/
#ifdef LUMA_raw
#define T 0
#define ME 1
#else
#define T 0
#define ME 0
#endif
/* Spatial kernel
*
* Increasing the spatial denoising factor (SS) reduces the weight of further
* pixels.
*
* Spatial distortion instructs the spatial kernel to view that axis as
* closer/further, for instance SD=(1,1,0.5) would make the temporal axis
* appear closer and increase blur between frames.
*
* The intra-patch variants do not yet have well-understood effects. They are
* intended to make large patch sizes more useful. Likely slower.
*
* SS: spatial denoising factor
* SD: spatial distortion (X, Y, time)
* PSS: intra-patch spatial denoising factor
* PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
* PSD: intra-patch spatial distortion (X, Y)
*/
#ifdef LUMA_raw
#define SS 0.25
#define SD vec3(1,1,1)
#define PST 0
#define PSS 0.0
#define PSD vec2(1,1)
#else
#define SS 0.25
#define SD vec3(1,1,1)
#define PST 0
#define PSS 0.0
#define PSD vec2(1,1)
#endif
/* Extremes preserve
*
* Reduces denoising around very bright/dark areas. The downscaling factor of
* EP (located near the top of this shader) controls the area sampled for
* luminance (higher numbers consider more area).
*
* EP: 1 to enable, 0 to disable
* DP: EP strength on dark patches, 0 to fully denoise
* BP: EP strength on bright patches, 0 to fully denoise
*/
#ifdef LUMA_raw
#define EP 1
#define BP 0.75
#define DP 0.25
#else
#define EP 0
#define BP 0.0
#define DP 0.0
#endif
/* Robust filtering
*
* This setting is dependent on code generation from nlmeans_cfg, so this
* setting can only be enabled via nlmeans_cfg.
*
* Compares the pixel-of-interest against downscaled pixels.
*
* This will virtually always improve quality, but will always disable
* textureGather optimizations.
*
* The downscale factor can be modified in the WIDTH/HEIGHT directives for the
* RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top
* of this shader, higher numbers increase blur.
*
* Any notation of RF as a positive number should be assumed to be referring to
* the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling
* factor is set to 3.
*/
#ifdef LUMA_raw
#define RF 1
#else
#define RF 1
#endif
/* Blur factor
*
* 0 to 1, only useful for alternative estimators. You're probably looking for
* "S" (denoising factor), go back to the top of the shader!
*/
#ifdef LUMA_raw
#define BF 1.0
#else
#define BF 1.0
#endif
/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
// Scaling factor (should match WIDTH/HEIGHT)
#ifdef LUMA_raw
#define SF 1
#else
#define SF 1
#endif
/* Estimator
*
* 0: means
* 1: Euclidean medians (extremely slow, may be good for heavy noise)
* 2: weight map (not a denoiser, maybe useful for generating image masks)
* 3: weighted median intensity (slow, may be good for heavy noise)
*/
#ifdef LUMA_raw
#define M 0
#else
#define M 0
#endif
// Patch donut (probably useless)
#ifdef LUMA_raw
#define PD 0
#else
#define PD 0
#endif
// Duplicate 1st weight
#ifdef LUMA_raw
#define D1W 0
#else
#define D1W 0
#endif
/* Shader code */
#define EPSILON 0.00000000001
#if PS == 6
const int hp = P/2;
#else
const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
#endif
#if RS == 6
const int hr = R/2;
#else
const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
#endif
// donut increment, increments without landing on (0,0,0)
// much faster than a "continue" statement
#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
// search shapes and their corresponding areas
#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
#define S_TRIANGLE_A(hz,Z) int(pow(hz, 2)+Z)
#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
#define S_DIAMOND_A(hz,Z) int(pow(hz, 2)*2+Z)
#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
#define S_PLUS_A(hz,Z) (Z*2 - 1)
#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
#define T1 (T+1)
#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
// Skip comparing the pixel-of-interest against itself, unless RF is enabled
#if RF
#define RINCR(z,c) (z.c++)
#else
#define RINCR DINCR
#endif
#define R_AREA(a) (a * T1 + RF-1)
// research shapes
#if R == 0 || R == 1
#define FOR_RESEARCH(r) S_1X1(r)
const int r_area = R_AREA(1);
#elif RS == 7
#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
const int r_area = R_AREA(S_PLUS_A(hr,R));
#elif RS == 6
#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
const int r_area = R_AREA(R*R);
#elif RS == 5
#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
#elif RS == 4
#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
#elif RS == 3
#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
const int r_area = R_AREA(S_DIAMOND_A(hr,R));
#elif RS == 2
#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
const int r_area = R_AREA(R);
#elif RS == 1
#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
const int r_area = R_AREA(R);
#elif RS == 0
#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
const int r_area = R_AREA(R*R);
#endif
#define RI1 (RI+1)
#define RFI1 (RFI+1)
#if RI
#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
#else
#define FOR_ROTATION
#endif
#if RFI
#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
#else
#define FOR_REFLECTION
#endif
#if PD
#define PINCR DINCR
#else
#define PINCR(z,c) (z.c++)
#endif
#define P_AREA(a) (a - PD)
// patch shapes
#if P == 0 || P == 1
#define FOR_PATCH(p) S_1X1(p)
const int p_area = P_AREA(1);
#elif PS == 7
#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
const int p_area = P_AREA(S_PLUS_A(hp,P));
#elif PS == 6
#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
const int p_area = P_AREA(P*P);
#elif PS == 5
#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
#elif PS == 4
#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
#elif PS == 3
#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
const int p_area = P_AREA(S_DIAMOND_A(hp,P));
#elif PS == 2
#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
const int p_area = P_AREA(P);
#elif PS == 1
#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
const int p_area = P_AREA(P);
#elif PS == 0
#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
const int p_area = P_AREA(P*P);
#endif
const float r_scale = 1.0/r_area;
const float p_scale = 1.0/p_area;
#define load_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
#if RF && defined(LUMA_raw)
#define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off))
#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
#elif RF && D1W
#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
#elif RF
#define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off))
#else
#define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off))
#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
#endif
#if T
vec4 load(vec3 off)
{
switch (int(off.z)) {
case 0: return load_(off);
}
}
vec4 load2(vec3 off)
{
switch (int(off.z)) {
case 0: return load2_(off);
}
}
#else
#define load(off) load_(off)
#define load2(off) load2_(off)
#endif
vec4 poi = load(vec3(0)); // pixel-of-interest
vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest
#if RI // rotation
vec2 rot(vec2 p, float d)
{
return vec2(
p.x * cos(radians(d)) - p.y * sin(radians(d)),
p.y * sin(radians(d)) + p.x * cos(radians(d))
);
}
#else
#define rot(p, d) (p)
#endif
#if RFI // reflection
vec2 ref(vec2 p, int d)
{
switch (d) {
case 0: return p;
case 1: return p * vec2(1, -1);
case 2: return p * vec2(-1, 1);
}
}
#else
#define ref(p, d) (p)
#endif
vec4 patch_comparison(vec3 r, vec3 r2)
{
vec3 p;
vec4 min_rot = vec4(p_area);
FOR_ROTATION FOR_REFLECTION {
vec4 pdiff_sq = vec4(0);
FOR_PATCH(p) {
vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
vec4 diff_sq = pow(load2(p + r2) - load2((transformed_p + r) * SF), vec4(2));
#if PST && P >= PST
float pdist = exp(-pow(length(p.xy*PSD)*PSS, 2));
diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist));
#endif
pdiff_sq += diff_sq;
}
min_rot = min(min_rot, pdiff_sq);
}
return min_rot * p_scale;
}
#define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false
#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER
// 3x3 diamond/plus patch_comparison_gather
const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
vec4 poi_patch = gather_offs(0, offsets);
vec4 patch_comparison_gather(vec3 r, vec3 r2)
{
float min_rot = p_area - 1;
vec4 transformer = gather_offs(r, offsets_sf);
FOR_ROTATION {
FOR_REFLECTION {
float diff_sq = dot(pow(poi_patch - transformer, vec4(2)), vec4(1));
min_rot = min(diff_sq, min_rot);
#if RFI
switch(rfi) {
case 0: transformer = transformer.zyxw; break;
case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
case 2: transformer = transformer.zyxw; break; // undoes last mirror
}
#endif
}
#if RI == 3
transformer = transformer.wxyz;
#elif RI == 1
transformer = transformer.zwxy;
#endif
}
return vec4(min_rot + pow(poi2.x - load2(r).x, 2), 0, 0, 0) * p_scale;
}
#elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER
// tiled even square patch_comparison_gather
vec4 patch_comparison_gather(vec3 r, vec3 r2)
{
vec2 tile;
float min_rot = p_area;
/* gather order:
* w z
* x y
*/
FOR_ROTATION FOR_REFLECTION {
float pdiff_sq = 0;
for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
vec4 poi_patch = gather(tile + r2.xy);
vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy);
#if RI
for (float i = 0; i < ri; i+=90)
transformer = transformer.wxyz; // rotate 90 degrees
#endif
#if RFI // XXX output is a little off
switch(rfi) {
case 1: transformer = transformer.zyxw; break;
case 2: transformer = transformer.xwzy; break;
}
#endif
vec4 diff_sq = pow(poi_patch - transformer, vec4(2));
#if PST && P >= PST
vec4 pdist = vec4(
exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)),
exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)),
exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)),
exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2))
);
diff_sq = pow(max(diff_sq, EPSILON), pdist);
#endif
pdiff_sq += dot(diff_sq, vec4(1));
}
min_rot = min(min_rot, pdiff_sq);
}
return vec4(min_rot, 0, 0, 0) * p_scale;
}
#else
#define patch_comparison_gather patch_comparison
#endif
vec4 hook()
{
vec4 total_weight = vec4(0);
vec4 sum = vec4(0);
vec4 result = vec4(0);
vec3 r = vec3(0);
vec3 p = vec3(0);
vec3 me = vec3(0);
#if T && ME == 1 // temporal & motion estimation
vec3 me_tmp = vec3(0);
float maxweight = 0;
#elif T && ME == 2 // temporal & motion estimation
vec3 me_sum = vec3(0);
float me_weight = 0;
#endif
#if WD == 2 || M == 3 // weight discard, weighted median intensities
int r_index = 0;
vec4 all_weights[r_area];
vec4 all_pixels[r_area];
#elif WD == 1 // weight discard
vec4 no_weights = vec4(0);
vec4 discard_total_weight = vec4(0);
vec4 discard_sum = vec4(0);
#endif
#if M == 1 // Euclidean medians
vec4 minsum = vec4(0);
#endif
FOR_FRAME(r) {
#if T && ME == 1 // temporal & motion estimation max weight
if (r.z > 0) {
me += me_tmp;
me_tmp = vec3(0);
maxweight = 0;
}
#elif T && ME == 2 // temporal & motion estimation weighted average
if (r.z > 0) {
me += round(me_sum / me_weight);
me_sum = vec3(0);
me_weight = 0;
}
#endif
FOR_RESEARCH(r) {
// main NLM logic
const float h = S*0.013;
const float pdiff_scale = 1.0/(h*h);
vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0));
vec4 weight = exp(-pdiff_sq * pdiff_scale);
#if T && ME == 1 // temporal & motion estimation max weight
me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
maxweight = max(maxweight, weight.x);
#elif T && ME == 2 // temporal & motion estimation weighted average
me_sum += vec3(r.xy,0) * weight.x;
me_weight += weight.x;
#endif
#if D1W
weight = vec4(weight.x);
#endif
weight *= exp(-pow(length(r*SD)*SS, 2)); // spatial kernel
#if WD == 2 || M == 3 // weight discard, weighted median intensity
all_weights[r_index] = weight;
all_pixels[r_index] = load(r+me);
r_index++;
#elif WD == 1 // weight discard
vec4 wd_scale = 1.0/max(no_weights, 1);
vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
discard_sum += load(r+me) * weight * (1 - keeps);
discard_total_weight += weight * (1 - keeps);
no_weights += keeps;
#endif
sum += load(r+me) * weight;
total_weight += weight;
#if M == 1 // Euclidean median
// Based on: https://arxiv.org/abs/1207.3056
// XXX might not work with ME
vec3 r2;
vec4 wpdist_sum = vec4(0);
FOR_FRAME(r2) FOR_RESEARCH(r2) {
vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me);
wpdist_sum += sqrt(pdist) * (1-weight);
}
vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum
newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0
newmin += step(minsum, vec4(0)); // || minsum <= 0
newmin = min(newmin, 1);
minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum);
result = (newmin * load(r+me)) + ((1-newmin) * result);
#endif
} // FOR_RESEARCH
} // FOR_FRAME
#if T // temporal
#endif
vec4 avg_weight = total_weight * r_scale;
vec4 old_avg_weight = avg_weight;
#if WD == 2 // true average
total_weight = vec4(0);
sum = vec4(0);
vec4 no_weights = vec4(0);
for (int i = 0; i < r_area; i++) {
vec4 keeps = step(avg_weight*WDT, all_weights[i]);
all_weights[i] *= keeps;
sum += all_pixels[i] * all_weights[i];
total_weight += all_weights[i];
no_weights += keeps;
}
#elif WD == 1 // moving cumulative average
total_weight -= discard_total_weight;
sum -= discard_sum;
#endif
#if WD // weight discard
avg_weight = total_weight / no_weights;
#endif
total_weight += SW;
sum += poi * SW;
#if M == 3 // weighted median intensity
const float hr_area = r_area/2.0;
vec4 is_median, gt, lt, gte, lte, neq;
for (int i = 0; i < r_area; i++) {
gt = lt = vec4(0);
for (int j = 0; j < r_area; j++) {
gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]);
lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]);
neq = 1 - gte * lte;
gt += gte * neq;
lt += lte * neq;
}
is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area));
result += step(result, vec4(0)) * is_median * all_pixels[i];
}
#elif M == 2 // weight map
result = avg_weight;
#elif M == 0 // mean
result = sum / total_weight;
#endif
#if AS == 1 // sharpen+denoise
vec4 sharpened = result + (poi - result) * ASF;
vec4 sharpening_power = pow(avg_weight, vec4(ASP));
#elif AS == 2 // sharpen only
vec4 sharpened = poi + (poi - result) * ASF;
vec4 sharpening_power = pow(avg_weight, vec4(ASP));
#endif
#if EP // extremes preserve
float luminance = EP_texOff(0).x;
// EPSILON is needed since pow(0,0) is undefined
float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
result = mix(poi, result, ep_weight);
#endif
#if AS == 1 // sharpen+denoise
result = mix(sharpened, result, sharpening_power);
#elif AS == 2 // sharpen only
result = mix(sharpened, poi, sharpening_power);
#endif
return mix(poi, result, BF);
}