-
-
Save sad1e/42fb2e59d2201db2651623fba9a15a9f to your computer and use it in GitHub Desktop.
Revisions
-
atyuwen revised this gist
Apr 4, 2025 . 1 changed file with 134 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,134 @@ //============================================================================================================================== // An optimized AMD FSR 1.0 implementation for Mobiles // EASU and RCAS are combined in a single pass. // Based on https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h // Details can be found: https://atyuwen.github.io/posts/optimizing-fsr/ // Distributed under the MIT License. Copyright (c) 2021 atyuwen. // -- FsrEasuSampleH should be implemented by calling shader, like following: // AH3 FsrEasuSampleH(AF2 p) { return MyTex.SampleLevel(LinearSampler, p, 0).xyz; } //============================================================================================================================== void FsrMobile( out AH3 pix, AF2 ip, AF4 con0, AF4 con1, AF4 con2, AF4 con3){ //------------------------------------------------------------------------------------------------------------------------------ // Direction is the '+' diff. // A // B C D // E AF2 pp=(ip)*(con0.xy)+(con0.zw); AF2 tc=(pp+AF2_(0.5))*con1.xy; AH3 sC=FsrEasuSampleH(tc); #if 0 // Set to 1 to make FSR only affect the screen's central region. if (any(abs(tc-0.5)>AF1(0.75/2.0))) { pix=sC; return; } #endif AH3 sA=FsrEasuSampleH(tc-AF2(0, con1.y)); AH3 sB=FsrEasuSampleH(tc-AF2(con1.x, 0)); AH3 sD=FsrEasuSampleH(tc+AF2(con1.x, 0)); AH3 sE=FsrEasuSampleH(tc+AF2(0, con1.y)); //------------------------------------------------------------------------------------------------------------------------------ // Combined RCAS: Min and max of ring. AH1 mn4R=min(AMin3H1(sA.r,sB.r,sD.r),sE.r); AH1 mn4G=min(AMin3H1(sA.g,sB.g,sD.g),sE.g); AH1 mn4B=min(AMin3H1(sA.b,sB.b,sD.b),sE.b); AH1 mx4R=max(AMax3H1(sA.r,sB.r,sD.r),sE.r); AH1 mx4G=max(AMax3H1(sA.g,sB.g,sD.g),sE.g); AH1 mx4B=max(AMax3H1(sA.b,sB.b,sD.b),sE.b); // Immediate constants for peak range. AH2 peakC=AH2(1.0,-1.0*4.0); // Limiters, these need to be high precision RCPs. AH1 hitMinR=mn4R*ARcpH1(AH1_(4.0)*mx4R); AH1 hitMinG=mn4G*ARcpH1(AH1_(4.0)*mx4G); AH1 hitMinB=mn4B*ARcpH1(AH1_(4.0)*mx4B); AH1 hitMaxR=(peakC.x-mx4R)*ARcpH1(AH1_(4.0)*mn4R+peakC.y); AH1 hitMaxG=(peakC.x-mx4G)*ARcpH1(AH1_(4.0)*mn4G+peakC.y); AH1 hitMaxB=(peakC.x-mx4B)*ARcpH1(AH1_(4.0)*mn4B+peakC.y); AH1 lobeR=max(-hitMinR,hitMaxR); AH1 lobeG=max(-hitMinG,hitMaxG); AH1 lobeB=max(-hitMinB,hitMaxB); AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH1_(con0.x); // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. AH1 rcpL=ARcpH1(AH1_(4.0)*lobe+AH1_(1.0)); AH3 contrast=(lobe*sA+lobe*sB+lobe*sD+lobe*sE)*rcpL; //------------------------------------------------------------------------------------------------------------------------------ AH1 lA=sA.r*AH1_(0.5)+sA.g; AH1 lB=sB.r*AH1_(0.5)+sB.g; AH1 lC=sC.r*AH1_(0.5)+sC.g; AH1 lD=sD.r*AH1_(0.5)+sD.g; AH1 lE=sE.r*AH1_(0.5)+sE.g; // Then takes magnitude from abs average of both sides of 'C'. // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. AH1 dc=lD-lC; AH1 cb=lC-lB; AH1 lenX=max(abs(dc),abs(cb)); lenX=ARcpH1(lenX); AH1 dirX=lD-lB; lenX=ASatH1(abs(dirX)*lenX); lenX*=lenX; // Repeat for the y axis. AH1 ec=lE-lC; AH1 ca=lC-lA; AH1 lenY=max(abs(ec),abs(ca)); lenY=ARcpH1(lenY); AH1 dirY=lE-lA; lenY=ASatH1(abs(dirY)*lenY); AH1 len = lenY * lenY + lenX; AH2 dir = AH2(dirX, dirY); //------------------------------------------------------------------------------------------------------------------------------ AH2 dir2=dir*dir; AH1 dirR=dir2.x+dir2.y; if (dirR<AH1_(1.0/64.0)) { pix=contrast+sC*rcpL; return; } dirR=ARsqH1(dirR); dir*=AH2_(dirR); len=len*AH1_(0.5); len*=len; AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*ARcpH1(max(abs(dir.x),abs(dir.y))); AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; AH1 clp=ARcpH1(lob); //------------------------------------------------------------------------------------------------------------------------------ AF2 fp=floor(pp); pp-=fp; AH2 ppp=AH2(pp); AF2 p0=fp*(con1.xy)+(con1.zw); AF2 p1=p0+(con2.xy); AF2 p2=p0+(con2.zw); AF2 p3=p0+(con3.xy); p0.y-=con1.w; p3.y+=con1.w; AH4 fgcbR=FsrEasuRH(p0); AH4 fgcbG=FsrEasuGH(p0); AH4 fgcbB=FsrEasuBH(p0); AH4 ijfeR=FsrEasuRH(p1); AH4 ijfeG=FsrEasuGH(p1); AH4 ijfeB=FsrEasuBH(p1); AH4 klhgR=FsrEasuRH(p2); AH4 klhgG=FsrEasuGH(p2); AH4 klhgB=FsrEasuBH(p2); AH4 nokjR=FsrEasuRH(p3); AH4 nokjG=FsrEasuGH(p3); AH4 nokjB=FsrEasuBH(p3); //------------------------------------------------------------------------------------------------------------------------------ // This part is different for FP16, working pairs of taps at a time. AH2 pR=AH2_(0.0); AH2 pG=AH2_(0.0); AH2 pB=AH2_(0.0); AH2 pW=AH2_(0.0); FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,fgcbR.zw,fgcbG.zw,fgcbB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,nokjR.xy,nokjG.xy,nokjB.xy); AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); AH1 aW=pW.x+pW.y; //------------------------------------------------------------------------------------------------------------------------------ pix=contrast+aC*AH3_(ARcpH1(aW)*rcpL);} -
atyuwen renamed this gist
Apr 4, 2025 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
atyuwen revised this gist
Sep 13, 2021 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,9 +2,9 @@ // An optimized AMD FSR's EASU implementation for Mobiles // Based on https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h // Details can be found: https://atyuwen.github.io/posts/optimizing-fsr/ // Distributed under the MIT License. Copyright (c) 2021 atyuwen. // -- FsrEasuSampleH should be implemented by calling shader, like following: // AH3 FsrEasuSampleH(AF2 p) { return MyTex.SampleLevel(LinearSampler, p, 0).xyz; } //============================================================================================================================== void FsrEasuL( out AH3 pix, -
atyuwen revised this gist
Sep 2, 2021 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -55,14 +55,14 @@ pix = sC; return; } dirR=ARsqH1(dirR); dir*=AH2_(dirR); len=len*AH1_(0.5); len*=len; AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*ARcpH1(max(abs(dir.x),abs(dir.y))); AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; AH1 clp=ARcpH1(lob); //------------------------------------------------------------------------------------------------------------------------------ AF2 fp=floor(pp); pp-=fp; -
atyuwen revised this gist
Aug 27, 2021 . 1 changed file with 2 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,7 +1,8 @@ //============================================================================================================================== // An optimized AMD FSR's EASU implementation for Mobiles // Based on https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h // Details can be found: https://atyuwen.github.io/posts/optimizing-fsr/ // Distributed under the MIT License. // -- FsrEasuSampleH should be implemented by calling shader, like following: // half3 FsrEasuSampleH(float2 p) { return MyTex.SampleLevel(LinearSampler, p, 0).xyz; } //============================================================================================================================== -
atyuwen revised this gist
Aug 27, 2021 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,7 @@ //============================================================================================================================== // An optimized AMD FSR's EASU implementation for Mobiles // Based on https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h // Distributed under the MIT License. // -- FsrEasuSampleH should be implemented by calling shader, like following: // half3 FsrEasuSampleH(float2 p) { return MyTex.SampleLevel(LinearSampler, p, 0).xyz; } //============================================================================================================================== @@ -97,5 +98,4 @@ AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); AH1 aW=pW.x+pW.y; //------------------------------------------------------------------------------------------------------------------------------ pix=aC*AH3_(ARcpH1(aW));} -
atyuwen created this gist
Aug 25, 2021 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,101 @@ //============================================================================================================================== // An optimized AMD FSR's EASU implementation for Mobiles // Based on https://github.com/GPUOpen-Effects/FidelityFX-FSR/blob/master/ffx-fsr/ffx_fsr1.h // -- FsrEasuSampleH should be implemented by calling shader, like following: // half3 FsrEasuSampleH(float2 p) { return MyTex.SampleLevel(LinearSampler, p, 0).xyz; } //============================================================================================================================== void FsrEasuL( out AH3 pix, AF2 ip, AF4 con0, AF4 con1, AF4 con2, AF4 con3){ //------------------------------------------------------------------------------------------------------------------------------ // Direction is the '+' diff. // A // B C D // E AF2 pp=(ip)*(con0.xy)+(con0.zw); AF2 tc=(pp+AF2_(0.5))*con1.xy; AH3 sA=FsrEasuSampleH(tc-AF2(0, con1.y)); AH3 sB=FsrEasuSampleH(tc-AF2(con1.x, 0)); AH3 sC=FsrEasuSampleH(tc); AH3 sD=FsrEasuSampleH(tc+AF2(con1.x, 0)); AH3 sE=FsrEasuSampleH(tc+AF2(0, con1.y)); AH1 lA=sA.r*AH1_(0.5)+sA.g; AH1 lB=sB.r*AH1_(0.5)+sB.g; AH1 lC=sC.r*AH1_(0.5)+sC.g; AH1 lD=sD.r*AH1_(0.5)+sD.g; AH1 lE=sE.r*AH1_(0.5)+sE.g; // Then takes magnitude from abs average of both sides of 'C'. // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. AH1 dc=lD-lC; AH1 cb=lC-lB; AH1 lenX=max(abs(dc),abs(cb)); lenX=ARcpH1(lenX); AH1 dirX=lD-lB; lenX=ASatH1(abs(dirX)*lenX); lenX*=lenX; // Repeat for the y axis. AH1 ec=lE-lC; AH1 ca=lC-lA; AH1 lenY=max(abs(ec),abs(ca)); lenY=ARcpH1(lenY); AH1 dirY=lE-lA; lenY=ASatH1(abs(dirY)*lenY); AH1 len = lenY * lenY + lenX; AH2 dir = AH2(dirX, dirY); //------------------------------------------------------------------------------------------------------------------------------ AH2 dir2=dir*dir; AH1 dirR=dir2.x+dir2.y; if (dirR<AH1_(1.0/64.0)) { pix = sC; return; } dirR=rsqrt(dirR); dir*=AH2_(dirR); len=len*AH1_(0.5); len*=len; AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*rcp(max(abs(dir.x),abs(dir.y))); AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; AH1 clp=rcp(lob); //------------------------------------------------------------------------------------------------------------------------------ AF2 fp=floor(pp); pp-=fp; AH2 ppp=AH2(pp); AF2 p0=fp*(con1.xy)+(con1.zw); AF2 p1=p0+(con2.xy); AF2 p2=p0+(con2.zw); AF2 p3=p0+(con3.xy); p0.y-=con1.w; p3.y+=con1.w; AH4 fgcbR=FsrEasuRH(p0); AH4 fgcbG=FsrEasuGH(p0); AH4 fgcbB=FsrEasuBH(p0); AH4 ijfeR=FsrEasuRH(p1); AH4 ijfeG=FsrEasuGH(p1); AH4 ijfeB=FsrEasuBH(p1); AH4 klhgR=FsrEasuRH(p2); AH4 klhgG=FsrEasuGH(p2); AH4 klhgB=FsrEasuBH(p2); AH4 nokjR=FsrEasuRH(p3); AH4 nokjG=FsrEasuGH(p3); AH4 nokjB=FsrEasuBH(p3); //------------------------------------------------------------------------------------------------------------------------------ // This part is different for FP16, working pairs of taps at a time. AH2 pR=AH2_(0.0); AH2 pG=AH2_(0.0); AH2 pB=AH2_(0.0); AH2 pW=AH2_(0.0); FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,fgcbR.zw,fgcbG.zw,fgcbB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,nokjR.xy,nokjG.xy,nokjB.xy); AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); AH1 aW=pW.x+pW.y; //------------------------------------------------------------------------------------------------------------------------------ // Slightly different for FP16 version due to combined min and max. pix=aC*AH3_(ARcpH1(aW));}