/* vi: ft=c * * Based on vf_nlmeans.c from FFmpeg. * * Copyright (c) 2022 an3223 * Copyright (c) 2016 Clément Bœsch * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 2.1 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ // Profile description: Luma only, saves some time by not processing the chroma plane. /* The recommended usage of this shader and its variant profiles is to add them * to input.conf and then dispatch the appropriate shader via a keybind during * media playback. Here is an example input.conf entry: * * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" * * These shaders can also be enabled by default in mpv.conf, for example: * * glsl-shaders='~~/shaders/nlmeans.glsl' * * Both of the examples above assume the shaders are located in a subdirectory * named "shaders" within mpv's config directory. Refer to the mpv * documentation for more details. * * This shader is highly configurable via user variables below. Although the * default settings should offer good quality at a reasonable speed, you are * encouraged to tweak them to your preferences. Be mindful that certain * settings may greatly affect speed. * * Denoising is most useful for noisy content. If there is no perceptible * noise, you probably won't see a positive difference. * * The default settings are generally tuned for low noise and high detail * preservation. The "medium" and "heavy" profiles are tuned for higher levels * of noise. * * The denoiser will not work properly if the content has been upscaled * beforehand, whether it was done by you or someone down the line. Consider * issuing a command to downscale in the mpv console, like so: * * vf toggle scale=-2:720 * * ...replacing 720 with whatever resolution seems appropriate. Rerun the * command to undo the downscale. It may take some trial-and-error to find the * proper resolution. */ /* Regarding speed * * Speed may vary wildly for different vo and gpu-api settings. Generally * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this * may be different for your system. * * If your GPU doesn't support textureGather, or if you are on a version of mpv * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ and VLQ * profiles. * * textureGather is LUMA only and limited to the following configurations: * * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}:M!=1 * - Default, very fast, rotations and reflections should be free * - If this is unusually slow then try changing gpu-api and vo * - If it's still slow, try setting RI/RFI to 0. * * - PS=6:RI={0,1,3}:RFI={0,1,2} * - Currently the only scalable variant * - Patch shape is asymmetric on two axis * - Rotations should have very little speed impact * - Reflections may have a significant speed impact * * Options which always disable textureGather: * - RF * - PD */ // The following is shader code injected from guided_s.glsl /* vi: ft=c * * Copyright (c) 2022 an3223 * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 2.1 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ //desc: "Self-guided" guided filter /* The radius can be adjusted with the MEANIP stage's downscaling factor. * Higher numbers give a bigger radius. * * The E variable can be found in the A stage. * * The subsampling (fast guided filter) can be adjusted with the IP stage's * downscaling factor. Higher numbers are faster. */ //!HOOK LUMA //!DESC Guided filter (IP) //!BIND HOOKED //!WIDTH HOOKED.w 1.0 / //!HEIGHT HOOKED.h 1.0 / //!SAVE _INJ_IP vec4 hook() { return HOOKED_texOff(0); } //!HOOK LUMA //!DESC Guided filter (MEANIP) //!BIND _INJ_IP //!WIDTH _INJ_IP.w 2.0 / //!HEIGHT _INJ_IP.h 2.0 / //!SAVE _INJ_MEANIP vec4 hook() { return _INJ_IP_texOff(0); } //!HOOK LUMA //!DESC Guided filter (_INJ_IP_SQ) //!BIND _INJ_IP //!WIDTH _INJ_IP.w //!HEIGHT _INJ_IP.h //!SAVE _INJ_IP_SQ vec4 hook() { return _INJ_IP_texOff(0) * _INJ_IP_texOff(0); } //!HOOK LUMA //!DESC Guided filter (CORRIP) //!BIND _INJ_IP_SQ //!WIDTH _INJ_MEANIP.w //!HEIGHT _INJ_MEANIP.h //!SAVE _INJ_CORRIP vec4 hook() { return _INJ_IP_SQ_texOff(0); } //!HOOK LUMA //!DESC Guided filter (A) //!BIND _INJ_MEANIP //!BIND _INJ_CORRIP //!WIDTH _INJ_IP.w //!HEIGHT _INJ_IP.h //!SAVE _INJ_A #define E 0.001 vec4 hook() { vec4 var = _INJ_CORRIP_texOff(0) - _INJ_MEANIP_texOff(0) * _INJ_MEANIP_texOff(0); vec4 cov = var; return cov / (var + E); } //!HOOK LUMA //!DESC Guided filter (B) //!BIND _INJ_A //!BIND _INJ_MEANIP //!WIDTH _INJ_IP.w //!HEIGHT _INJ_IP.h //!SAVE _INJ_B vec4 hook() { return _INJ_MEANIP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANIP_texOff(0); } //!HOOK LUMA //!DESC Guided filter (MEANA) //!BIND _INJ_A //!WIDTH _INJ_MEANIP.w //!HEIGHT _INJ_MEANIP.h //!SAVE _INJ_MEANA vec4 hook() { return _INJ_A_texOff(0); } //!HOOK LUMA //!DESC Guided filter (MEANB) //!BIND _INJ_B //!WIDTH _INJ_MEANIP.w //!HEIGHT _INJ_MEANIP.h //!SAVE _INJ_MEANB vec4 hook() { return _INJ_B_texOff(0); } //!HOOK LUMA //!DESC Guided filter //!BIND HOOKED //!BIND _INJ_MEANA //!BIND _INJ_MEANB //!SAVE RF_LUMA vec4 hook() { return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); } // End of source code injected from guided_s.glsl //!HOOK LUMA //!DESC Non-local means (downscale) //!WIDTH LUMA.w 3 / //!HEIGHT LUMA.h 3 / //!BIND LUMA //!SAVE EP vec4 hook() { return LUMA_texOff(0); } //!HOOK LUMA //!DESC Non-local means (share) //!BIND RF_LUMA //!SAVE RF vec4 hook() { return RF_LUMA_texOff(0); } //!HOOK LUMA //!BIND HOOKED //!BIND RF_LUMA //!BIND EP //!BIND RF //!DESC Non-local means (nlmeans_luma.glsl) /* User variables * * It is usually preferable to denoise chroma and luma differently, so the user * variables for luma and chroma are split. */ /* S = denoising factor * P = patch size * R = research size * * The denoising factor controls the level of blur, higher is blurrier. * * Patch size should usually be an odd number greater than or equal to 3. * Higher values are slower and not always better. * * Research size usually be an odd number greater than or equal to 3. Higher * values are usually better, but slower and offer diminishing returns. * * Even-numbered patch/research sizes will sample between pixels unless PS=6. * It's not known whether this is ever useful behavior or not. This is * incompatible with textureGather optimizations, so enable RF when using even * patch/research sizes. */ #ifdef LUMA_raw #define S 2.25 #define P 3 #define R 5 #else #define S 1.50 #define P 3 #define R 5 #endif /* Adaptive sharpening * * Uses the blur incurred by denoising plus the weight map to perform an * unsharp mask that gets applied most strongly to edges. * * Sharpening will amplify noise, so the denoising factor (S) should usually be * increased to compensate. * * AS: 2 for sharpening, 1 for sharpening+denoising, 0 to disable * ASF: Sharpening factor, higher numbers make a sharper underlying image * ASP: Weight power, higher numbers use more of the sharp image */ #ifdef LUMA_raw #define AS 0 #define ASF 1.0 #define ASP 2.0 #else #define AS 0 #define ASF 1.0 #define ASP 2.0 #endif /* Starting weight * * Lower numbers give less weight to the pixel-of-interest, which may help * handle higher noise levels, ringing, and may be useful for other things too? * * EPSILON should be used instead of zero to avoid divide-by-zero errors. The * avg_weight variable may be used to make SW adapt to the local noise level, * e.g., SW=max(avg_weight, EPSILON) */ #ifdef LUMA_raw #define SW 1.0 #else #define SW 1.0 #endif /* Weight discard * * Discard weights that fall below a fraction of the average weight. This culls * the most dissimilar samples from the blur, yielding a much more pleasant * result, especially around edges. * * WD: * - 2: True average. Very good quality, but slower and uses more memory. * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. * - 0: Disable * * WDT: Threshold coefficient, higher numbers discard more * WDP (WD=1): Higher numbers reduce the threshold more for small sample sizes */ #ifdef LUMA_raw #define WD 2 #define WDT 1.0 #define WDP 6.0 #else #define WD 2 #define WDT 1.0 #define WDP 6.0 #endif /* Search shape * * Determines the shape of patches and research zones. Different shapes have * different speed and quality characteristics. Every shape (besides square) is * smaller than square. * * PS applies applies to patches, RS applies to research zones. * * 0: square (symmetrical) * 1: horizontal line (asymmetric) * 2: vertical line (asymmetric) * 3: diamond (symmetrical) * 4: triangle (asymmetric, pointing upward) * 5: truncated triangle (asymmetric on two axis, last row halved) * 6: even sized square (asymmetric on two axis) * 7: plus (symmetrical) */ #ifdef LUMA_raw #define RS 3 #define PS 3 #else #define RS 3 #define PS 3 #endif /* Rotational/reflectional invariance * * Number of rotations/reflections to try for each patch comparison. Slow, but * improves feature preservation, although adding more rotations/reflections * gives diminishing returns. The most similar rotation/reflection will be used. * * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * * RI: Rotational invariance * RFI (0 to 2): Reflectional invariance */ #ifdef LUMA_raw #define RI 3 #define RFI 2 #else #define RI 0 #define RFI 0 #endif /* Temporal denoising * * Caveats: * - Slower, each frame needs to be researched * - Requires vo=gpu-next and nlmeans_temporal.glsl * - Luma-only (this is a bug) * - Buggy * * Gather samples across multiple frames. May cause motion blur and may * struggle more with noise that persists across multiple frames (compression * noise, repeating frames), but can work very well on high quality video. * * Motion estimation (ME) should improve quality without impacting speed. * * T: number of frames used * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg */ #ifdef LUMA_raw #define T 0 #define ME 1 #else #define T 0 #define ME 0 #endif /* Spatial kernel * * Increasing the spatial denoising factor (SS) reduces the weight of further * pixels. * * Spatial distortion instructs the spatial kernel to view that axis as * closer/further, for instance SD=(1,1,0.5) would make the temporal axis * appear closer and increase blur between frames. * * The intra-patch variants do not yet have well-understood effects. They are * intended to make large patch sizes more useful. Likely slower. * * SS: spatial denoising factor * SD: spatial distortion (X, Y, time) * PSS: intra-patch spatial denoising factor * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables * PSD: intra-patch spatial distortion (X, Y) */ #ifdef LUMA_raw #define SS 0.25 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 #define PSD vec2(1,1) #else #define SS 0.25 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 #define PSD vec2(1,1) #endif /* Extremes preserve * * Reduces denoising around very bright/dark areas. The downscaling factor of * EP (located near the top of this shader) controls the area sampled for * luminance (higher numbers consider more area). * * EP: 1 to enable, 0 to disable * DP: EP strength on dark patches, 0 to fully denoise * BP: EP strength on bright patches, 0 to fully denoise */ #ifdef LUMA_raw #define EP 1 #define BP 0.75 #define DP 0.25 #else #define EP 0 #define BP 0.0 #define DP 0.0 #endif /* Robust filtering * * This setting is dependent on code generation from nlmeans_cfg, so this * setting can only be enabled via nlmeans_cfg. * * Compares the pixel-of-interest against downscaled pixels. * * This will virtually always improve quality, but will always disable * textureGather optimizations. * * The downscale factor can be modified in the WIDTH/HEIGHT directives for the * RF texture (for CHROMA, RGB) and RF_LUMA (LUMA only) textures near the top * of this shader, higher numbers increase blur. * * Any notation of RF as a positive number should be assumed to be referring to * the downscaling factor, e.g., RF=3 means RF is set to 1 and the downscaling * factor is set to 3. */ #ifdef LUMA_raw #define RF 1 #else #define RF 1 #endif /* Blur factor * * 0 to 1, only useful for alternative estimators. You're probably looking for * "S" (denoising factor), go back to the top of the shader! */ #ifdef LUMA_raw #define BF 1.0 #else #define BF 1.0 #endif /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ // Scaling factor (should match WIDTH/HEIGHT) #ifdef LUMA_raw #define SF 1 #else #define SF 1 #endif /* Estimator * * 0: means * 1: Euclidean medians (extremely slow, may be good for heavy noise) * 2: weight map (not a denoiser, maybe useful for generating image masks) * 3: weighted median intensity (slow, may be good for heavy noise) */ #ifdef LUMA_raw #define M 0 #else #define M 0 #endif // Patch donut (probably useless) #ifdef LUMA_raw #define PD 0 #else #define PD 0 #endif // Duplicate 1st weight #ifdef LUMA_raw #define D1W 0 #else #define D1W 0 #endif /* Shader code */ #define EPSILON 0.00000000001 #if PS == 6 const int hp = P/2; #else const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes #endif #if RS == 6 const int hr = R/2; #else const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes #endif // donut increment, increments without landing on (0,0,0) // much faster than a "continue" statement #define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) // search shapes and their corresponding areas #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) #define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) #define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) #define S_TRIANGLE_A(hz,Z) int(pow(hz, 2)+Z) #define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) #define S_DIAMOND_A(hz,Z) int(pow(hz, 2)*2+Z) #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) #define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) #define S_PLUS_A(hz,Z) (Z*2 - 1) #define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) #define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) #define T1 (T+1) #define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) // Skip comparing the pixel-of-interest against itself, unless RF is enabled #if RF #define RINCR(z,c) (z.c++) #else #define RINCR DINCR #endif #define R_AREA(a) (a * T1 + RF-1) // research shapes #if R == 0 || R == 1 #define FOR_RESEARCH(r) S_1X1(r) const int r_area = R_AREA(1); #elif RS == 7 #define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) const int r_area = R_AREA(S_PLUS_A(hr,R)); #elif RS == 6 #define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) const int r_area = R_AREA(R*R); #elif RS == 5 #define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); #elif RS == 4 #define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); #elif RS == 3 #define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) const int r_area = R_AREA(S_DIAMOND_A(hr,R)); #elif RS == 2 #define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) const int r_area = R_AREA(R); #elif RS == 1 #define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) const int r_area = R_AREA(R); #elif RS == 0 #define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) const int r_area = R_AREA(R*R); #endif #define RI1 (RI+1) #define RFI1 (RFI+1) #if RI #define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) #else #define FOR_ROTATION #endif #if RFI #define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) #else #define FOR_REFLECTION #endif #if PD #define PINCR DINCR #else #define PINCR(z,c) (z.c++) #endif #define P_AREA(a) (a - PD) // patch shapes #if P == 0 || P == 1 #define FOR_PATCH(p) S_1X1(p) const int p_area = P_AREA(1); #elif PS == 7 #define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) const int p_area = P_AREA(S_PLUS_A(hp,P)); #elif PS == 6 #define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) const int p_area = P_AREA(P*P); #elif PS == 5 #define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); #elif PS == 4 #define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); #elif PS == 3 #define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) const int p_area = P_AREA(S_DIAMOND_A(hp,P)); #elif PS == 2 #define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) const int p_area = P_AREA(P); #elif PS == 1 #define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) const int p_area = P_AREA(P); #elif PS == 0 #define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) const int p_area = P_AREA(P*P); #endif const float r_scale = 1.0/r_area; const float p_scale = 1.0/p_area; #define load_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off)) #if RF && defined(LUMA_raw) #define load2_(off) RF_LUMA_tex(RF_LUMA_pos + RF_LUMA_pt * vec2(off)) #define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) #define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) #elif RF && D1W #define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off)) #define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) #define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) #elif RF #define load2_(off) RF_tex(RF_pos + RF_pt * vec2(off)) #else #define load2_(off) HOOKED_tex(HOOKED_pos + HOOKED_pt * vec2(off)) #define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) #define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) #endif #if T vec4 load(vec3 off) { switch (int(off.z)) { case 0: return load_(off); } } vec4 load2(vec3 off) { switch (int(off.z)) { case 0: return load2_(off); } } #else #define load(off) load_(off) #define load2(off) load2_(off) #endif vec4 poi = load(vec3(0)); // pixel-of-interest vec4 poi2 = load2(vec3(0)); // guide pixel-of-interest #if RI // rotation vec2 rot(vec2 p, float d) { return vec2( p.x * cos(radians(d)) - p.y * sin(radians(d)), p.y * sin(radians(d)) + p.x * cos(radians(d)) ); } #else #define rot(p, d) (p) #endif #if RFI // reflection vec2 ref(vec2 p, int d) { switch (d) { case 0: return p; case 1: return p * vec2(1, -1); case 2: return p * vec2(-1, 1); } } #else #define ref(p, d) (p) #endif vec4 patch_comparison(vec3 r, vec3 r2) { vec3 p; vec4 min_rot = vec4(p_area); FOR_ROTATION FOR_REFLECTION { vec4 pdiff_sq = vec4(0); FOR_PATCH(p) { vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); vec4 diff_sq = pow(load2(p + r2) - load2((transformed_p + r) * SF), vec4(2)); #if PST && P >= PST float pdist = exp(-pow(length(p.xy*PSD)*PSS, 2)); diff_sq = pow(max(diff_sq, EPSILON), vec4(pdist)); #endif pdiff_sq += diff_sq; } min_rot = min(min_rot, pdiff_sq); } return min_rot * p_scale; } #define NO_GATHER (PD == 0) // never textureGather if any of these conditions are false #define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) #if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && M != 1 && REGULAR_ROTATIONS && NO_GATHER // 3x3 diamond/plus patch_comparison_gather const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; vec4 poi_patch = gather_offs(0, offsets); vec4 patch_comparison_gather(vec3 r, vec3 r2) { float min_rot = p_area - 1; vec4 transformer = gather_offs(r, offsets_sf); FOR_ROTATION { FOR_REFLECTION { float diff_sq = dot(pow(poi_patch - transformer, vec4(2)), vec4(1)); min_rot = min(diff_sq, min_rot); #if RFI switch(rfi) { case 0: transformer = transformer.zyxw; break; case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror case 2: transformer = transformer.zyxw; break; // undoes last mirror } #endif } #if RI == 3 transformer = transformer.wxyz; #elif RI == 1 transformer = transformer.zwxy; #endif } return vec4(min_rot + pow(poi2.x - load2(r).x, 2), 0, 0, 0) * p_scale; } #elif (defined(LUMA_gather) || D1W) && PS == 6 && REGULAR_ROTATIONS && NO_GATHER // tiled even square patch_comparison_gather vec4 patch_comparison_gather(vec3 r, vec3 r2) { vec2 tile; float min_rot = p_area; /* gather order: * w z * x y */ FOR_ROTATION FOR_REFLECTION { float pdiff_sq = 0; for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { vec4 poi_patch = gather(tile + r2.xy); vec4 transformer = gather(ref(rot(tile + 0.5, ri), rfi) - 0.5 + r.xy); #if RI for (float i = 0; i < ri; i+=90) transformer = transformer.wxyz; // rotate 90 degrees #endif #if RFI // XXX output is a little off switch(rfi) { case 1: transformer = transformer.zyxw; break; case 2: transformer = transformer.xwzy; break; } #endif vec4 diff_sq = pow(poi_patch - transformer, vec4(2)); #if PST && P >= PST vec4 pdist = vec4( exp(-pow(length((tile+vec2(0,1))*PSD)*PSS, 2)), exp(-pow(length((tile+vec2(1,1))*PSD)*PSS, 2)), exp(-pow(length((tile+vec2(1,0))*PSD)*PSS, 2)), exp(-pow(length((tile+vec2(0,0))*PSD)*PSS, 2)) ); diff_sq = pow(max(diff_sq, EPSILON), pdist); #endif pdiff_sq += dot(diff_sq, vec4(1)); } min_rot = min(min_rot, pdiff_sq); } return vec4(min_rot, 0, 0, 0) * p_scale; } #else #define patch_comparison_gather patch_comparison #endif vec4 hook() { vec4 total_weight = vec4(0); vec4 sum = vec4(0); vec4 result = vec4(0); vec3 r = vec3(0); vec3 p = vec3(0); vec3 me = vec3(0); #if T && ME == 1 // temporal & motion estimation vec3 me_tmp = vec3(0); float maxweight = 0; #elif T && ME == 2 // temporal & motion estimation vec3 me_sum = vec3(0); float me_weight = 0; #endif #if WD == 2 || M == 3 // weight discard, weighted median intensities int r_index = 0; vec4 all_weights[r_area]; vec4 all_pixels[r_area]; #elif WD == 1 // weight discard vec4 no_weights = vec4(0); vec4 discard_total_weight = vec4(0); vec4 discard_sum = vec4(0); #endif #if M == 1 // Euclidean medians vec4 minsum = vec4(0); #endif FOR_FRAME(r) { #if T && ME == 1 // temporal & motion estimation max weight if (r.z > 0) { me += me_tmp; me_tmp = vec3(0); maxweight = 0; } #elif T && ME == 2 // temporal & motion estimation weighted average if (r.z > 0) { me += round(me_sum / me_weight); me_sum = vec3(0); me_weight = 0; } #endif FOR_RESEARCH(r) { // main NLM logic const float h = S*0.013; const float pdiff_scale = 1.0/(h*h); vec4 pdiff_sq = (r.z == 0) ? patch_comparison_gather(r+me, vec3(0)) : patch_comparison(r+me, vec3(0)); vec4 weight = exp(-pdiff_sq * pdiff_scale); #if T && ME == 1 // temporal & motion estimation max weight me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); maxweight = max(maxweight, weight.x); #elif T && ME == 2 // temporal & motion estimation weighted average me_sum += vec3(r.xy,0) * weight.x; me_weight += weight.x; #endif #if D1W weight = vec4(weight.x); #endif weight *= exp(-pow(length(r*SD)*SS, 2)); // spatial kernel #if WD == 2 || M == 3 // weight discard, weighted median intensity all_weights[r_index] = weight; all_pixels[r_index] = load(r+me); r_index++; #elif WD == 1 // weight discard vec4 wd_scale = 1.0/max(no_weights, 1); vec4 keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); discard_sum += load(r+me) * weight * (1 - keeps); discard_total_weight += weight * (1 - keeps); no_weights += keeps; #endif sum += load(r+me) * weight; total_weight += weight; #if M == 1 // Euclidean median // Based on: https://arxiv.org/abs/1207.3056 // XXX might not work with ME vec3 r2; vec4 wpdist_sum = vec4(0); FOR_FRAME(r2) FOR_RESEARCH(r2) { vec4 pdist = (r.z + r2.z) == 0 ? patch_comparison_gather(r+me, r2+me) : patch_comparison(r+me, r2+me); wpdist_sum += sqrt(pdist) * (1-weight); } vec4 newmin = step(wpdist_sum, minsum); // wpdist_sum <= minsum newmin *= 1 - step(wpdist_sum, vec4(0)); // && wpdist_sum > 0 newmin += step(minsum, vec4(0)); // || minsum <= 0 newmin = min(newmin, 1); minsum = (newmin * wpdist_sum) + ((1-newmin) * minsum); result = (newmin * load(r+me)) + ((1-newmin) * result); #endif } // FOR_RESEARCH } // FOR_FRAME #if T // temporal #endif vec4 avg_weight = total_weight * r_scale; vec4 old_avg_weight = avg_weight; #if WD == 2 // true average total_weight = vec4(0); sum = vec4(0); vec4 no_weights = vec4(0); for (int i = 0; i < r_area; i++) { vec4 keeps = step(avg_weight*WDT, all_weights[i]); all_weights[i] *= keeps; sum += all_pixels[i] * all_weights[i]; total_weight += all_weights[i]; no_weights += keeps; } #elif WD == 1 // moving cumulative average total_weight -= discard_total_weight; sum -= discard_sum; #endif #if WD // weight discard avg_weight = total_weight / no_weights; #endif total_weight += SW; sum += poi * SW; #if M == 3 // weighted median intensity const float hr_area = r_area/2.0; vec4 is_median, gt, lt, gte, lte, neq; for (int i = 0; i < r_area; i++) { gt = lt = vec4(0); for (int j = 0; j < r_area; j++) { gte = step(all_pixels[i]*all_weights[i], all_pixels[j]*all_weights[j]); lte = step(all_pixels[j]*all_weights[j], all_pixels[i]*all_weights[i]); neq = 1 - gte * lte; gt += gte * neq; lt += lte * neq; } is_median = step(gt, vec4(hr_area)) * step(lt, vec4(hr_area)); result += step(result, vec4(0)) * is_median * all_pixels[i]; } #elif M == 2 // weight map result = avg_weight; #elif M == 0 // mean result = sum / total_weight; #endif #if AS == 1 // sharpen+denoise vec4 sharpened = result + (poi - result) * ASF; vec4 sharpening_power = pow(avg_weight, vec4(ASP)); #elif AS == 2 // sharpen only vec4 sharpened = poi + (poi - result) * ASF; vec4 sharpening_power = pow(avg_weight, vec4(ASP)); #endif #if EP // extremes preserve float luminance = EP_texOff(0).x; // EPSILON is needed since pow(0,0) is undefined float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); result = mix(poi, result, ep_weight); #endif #if AS == 1 // sharpen+denoise result = mix(sharpened, result, sharpening_power); #elif AS == 2 // sharpen only result = mix(sharpened, poi, sharpening_power); #endif return mix(poi, result, BF); }