Last active: March 10, 2025 12:49
        
Revisions
        
inikishev revised this gist
Mar 10, 2025. 1 changed file with 96 additions and 2 deletions.
@@ -252,7 +252,7 @@
from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)
from .Muon import Muon # MomentUm Orthogonalized by Newton-Schulz
from .orth_optim import orthogonalise
@@ -532,4 +532,98 @@
# keeps the norm of each parameter vector fixed and mean at zero during the optimization process
from .SOAP_MUON import SOAP_Muon # SOAP + Muon = SOAP_Muon
from .psgd_kron_lra import KronLRA # LRA per kron factor
from .psgd_kron_contraction import Kron # joint learning of Xilin Li's criterion 3 as well as Madeleine Udell's contraction factor on the Lie group
from .Moonlight import Muon # Muon with lr normalization based on param size and maybe other stuff
from .FedPD import PSVRG, PSGD, FedPD_SGD, FedPD_VR # Federated Primal-Dual Algorithm
from .llmart import GreedyCoordinateGradient # greedy coordinate gradient
from .EOPC import Rosen # Optimizing Mode Connectivity for Class Incremental Learning
from .Autoenv import IEKF # iterative extended Kalman filter optimizer
from .fastr import FastrD, FastrN, STORMplus, StormPlus # Fully Adaptive STochastic Recursive-momentum
from .NeuralNetwork import SLBI, SLBI2, SLBI_ADAM_ToolBox, SLBI_SGD_ToolBox
from .DessiLBI import SLBI, SLBI_ToolBox # Exploring Structural Sparsity of Deep Networks via Inverse Scale Spaces
from .dowg import DoWG, CDoWG # DoWG Unleashed: An Efficient Universal Parameter-Free Gradient Descent Method
from .archai import CocobBackprop, CocobOns, Lamb # Microsoft's NAS lib
from .coin_betting import SGDOL, Cocob, Recursive, Regralizer, Scinol2, ONSBet # parameter-free coin-betting optimizers
from .dolphinflow import DolphinFlow # recent Muon/AdamW-like optimizer with a bunch of settings to tune; https://github.com/cognitivecomputations/dolphinflow-optimizer
from .neosr import adamw_win, adan_sf, adamw_sf, adan, soap_sf, fsam # from a super-resolution lib, plus stuff adapted from heavyball
from .recpre import SOAP, LionW, SophiaG, Lilith, ELLISAdam, IVON, ZeroShampooWithAdamGraftingOptimizer, OrthogonalNesterov # recurrent pretraining
from .supertrainer2k import Adalite, Lilith # idk
from .wu_nature_comms_2024 import NewStyleBatchFISTAOptim, NewStyleSingleFISTAOptim # something insane
from .dd4ml import APTS, APTS_D, TrustRegion, TrustRegionLegacy # Additively preconditioned trust-region strategies for machine learning; requires some type of config and some type of subdomain_optimizer
from .koaning_io_more_descent_less_gradient import KeepStepping, KeepVaulting # keeps stepping on a single batch, or maybe it was supposed to be a line search
from .CR import COMP # Compact representations for recursive Hessian matrix estimates (similar to L-BFGS)
from .MaxFactor import MaxFactor # ultra recent
from .scion import Scion # Training Deep Learning Models with Norm-Constrained LMOs
from .rapp import RAPPsgd, RAPPadam, ExtraAdagrad, ExtraAdam, ExtraSGD, EGplusAdam, EGplusSGD, LA, AdamLA, ExtraSGDLA, ExtraAdamLA, EGplusLA, EGplusAdamLA # Stable Nonconvex-Nonconcave Training via Linear Interpolation
from .storm_plus import STORMplus # STORM+
from .AccSGD import AccSGD # On the insufficiency of existing momentum schemes for Stochastic Optimization
from .AdaInject import AdaBelief, AdaBeliefInject, AdamInject, diffGrad, diffGradInject, Radam, RadamInject # AdaInject: Injection Based Adaptive Gradient Descent Optimizers for Convolutional Neural Networks
from .PowerSign_and_AddSign import AddSign, PowerSign # https://github.com/Neoanarika/Implementing-the-PowerSign-and-AddSign-rule
from .AddSign_PowerSign_in_PyTorch import AddSign, PowerSign, LinearInternalDecay, CosineInternalDecay, RestartCosineInternalDecay # https://github.com/cydonia999/AddSign_PowerSign_in_PyTorch
# Neural Optimizer Search with Reinforcement Learning
from .neumann_optimizer import Neumann, Neumann2 # https://github.com/jayybhatt/neumann-optimizer # A Practical Optimization Algorithm for Deep Neural Networks (implicitly computes the inverse Hessian of each mini-batch to produce descent directions)
from .neural_search_optimizer import Optimizer_1 # https://github.com/daviddao/pytorch-neural-search-optimizer # Neural Optimizer Search's Optimizer_1
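The orth_optim entry touched in the first hunk is a class decorator rather than an optimizer: the comment on that line in the original file (see the Dec 6 revision below) documents the call `orthogonalise(AdamW)(model.parameters(), lr = 1e-3)`. A minimal sketch of that pattern, assuming only that documented call; the relative import only resolves inside this module's package, and keyword arguments beyond lr are not assumed:

import torch
from torch.optim import AdamW
from .orth_optim import orthogonalise  # relative import, as used throughout this file

model = torch.nn.Linear(8, 2)
OrthAdamW = orthogonalise(AdamW)              # wrap the optimizer *class*
opt = OrthAdamW(model.parameters(), lr=1e-3)  # then construct it as usual

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
opt.step()                                    # step on orthogonalised gradients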
        
inikishev revised this gist
Feb 10, 2025. 1 changed file with 92 additions and 8 deletions.
@@ -2,11 +2,14 @@
# ruff: noqa: F811
# ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
from pytorch_optimizer import ADOPT, AdaBelief #, ...
from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
from heavyball import ForeachSFAdamW, PaLMForeachSFAdamW, ForeachADOPT, ForeachMuon, ForeachLaProp, MuonLaProp, ForeachSOAP, PaLMForeachSOAP, PrecondScheduleForeachSOAP, PrecondSchedulePaLMForeachSOAP, ForeachPSGDKron, ForeachPurePSGD, ForeachCachedDelayedPSGDKron, ForeachCachedPSGDKron, ForeachDelayedPSGD
from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
from timm.optim import AdaBelief, Adafactor #, ...
# ----------------------------------- MINE ----------------------------------- #
from torchzero.optim import Adagrad, AdamW #, ...
# ----------------------------------- misc ----------------------------------- #
from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
@@ -18,6 +21,8 @@
from .pyutils import Adam_GC, DAdaptAdam, DAdaptSGD, GLD, Lookahead, Prodigy, RAdam, SAM, SGD_GC, SMTP
from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars
from .cringe_live import AdaAbs, AdaptiveCompass, Clybius, Compass, DOPE, ExMachina, FARMSCropV3, FCompass, SAVEUS
from .Personalized_Optimizers import FARMSCrop, FARMSCropV2, FCompass, FishMonger, FMARSCrop, FMARSCrop_ExMachina, FMARSCropV2
# ----------------------------------- repos ---------------------------------- #
from .kron_torch import Kron
@@ -67,6 +72,7 @@
from .torch_kfac import KFACOptimizer, EKFACOptimizer
from .KFAC import KFAC, EKFAC, GKFAC
from .torch_kfac2 import KFAC # MAYBE GOOD
# Kronecker-Factored Approximate Curvature
from .SGDPH.sgdph import sgdph
@@ -424,7 +430,7 @@
# AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule
from .alopex import Alopex # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version (gradient free)
from .statopt import QHM, SALSA, SSLS, SASA, SLOPE # ??? idk
@@ -442,10 +448,88 @@
# Uncertainty Quantification with the Empirical Tangent Kernel
from .SimuRLacra import GSS # Golden Section Search (I think this is gradient free and for univariate funcs)
from .gcopt import GCOptimizer # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)
from .k_fac import KFACOptimizer, KFACIDOptimizer, SKFACOptimizer, EKFACOptimizer, KBFGSOptimizer, KBFGSLOptimizer, KBFGSL2LOOPOptimizer, KBFGSLMEOptimizer, NGDOptimizer # biggest K-FAC repo (I fixed all acc_stats)
from .proxyprox import ProxyProx # konstmish's mysterious ProxyProx (has step as well as inner_step methods)
from .SWANOptimizer import SWAN # SWAN (SGD with Whitening And Normalization)
from .sparse_szo import DuelingEvolutionOptimizer, VanillaEvolutionOptimizer, OneSideEvolutionOptimizer, TwoSideEvolutionOptimizer, FirstOrderOptimizer, FirstOrderBanditOptimizer # Sparse Perturbations for Improved Convergence in Stochastic Zeroth-Order Optimization
from .PSGD_Nuon import Nuon, AutoNuon # uses single-sided whitening that is dynamic and learned instead of being instantaneous like Muon
from .coherent_gradients import RA3, RM3, M3 # Weak and Strong Gradient Directions: Explaining Memorization, Generalization, and Hardness of Examples at Scale
from .eva import Eva, EvaExperimental, KFAC, AdaKFAC, AdaKFAC2, KFACSAM, MFAC, Shampoo # Eva: Practical Second-order Optimization with Kronecker-vectorized Approximation (pretty sure they modify the gradient and don't update params)
from .natural_galore import SubSpaceAdamW # GaLore extension - Natural Gradient Descent in low rank subspace
from .galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit # Memory-Efficient LLM Training by Gradient Low-Rank Projection
from .compass_optimizer import CompassExperimental4Bit, CompassExperimental8Bit, CompassExperimental8BitBNB, Compasstic, LPFAdamW, AdamW, RMSProp # a modification of the original AdamW optimizer, replacing the momentum moment with a smoothing filter
from .sgd_sai import SGD_sai # No More Adam: Learning Rate Scaling at Initialization is All You Need
from .unrl import EligibilityTraceOptimizer, KFACOptimizer # optimizers from a Reinforcement Learning algorithms library
from .second_order_optimization_NQS import SecondOrderOpt # Second-order Optimisation strategies for neural network quantum states
from .ldadamw_torch import LDAdamW # Low-Dimensional Adam - Adaptive Optimization from Low-Dimensional Gradient Statistics
from .pydrsom import DRSOMB, DRSOMB2, DRSOMK, DRSOMVec, KDRSOM # dimension-reduced second-order method (DRSOM)
from .AdaGL import AdaGL, FractionalSmoothLoss # deep learning optimizer that combines fractional-order calculus with adaptive techniques, using Grünwald–Letnikov derivatives
from .mkor import MKOR # Momentum-Enabled Kronecker-Factor-Based Optimizer Using Rank-1 Updates
from .sn_sm import GenericOptim, AdamWSN, AdamWSNG # GenericOptim is maybe it; Subset-Norm and Subspace-Momentum: Faster Memory-Efficient Adaptive Optimization with Convergence Guarantees
from .OptML_Project import Adasub, Adahessian # comparison of second-order optimizers on transformers
from .MARS import MARS, ADOPT, Muon, AdamW # MARS (Make vAriance Reduction Shine)
from .pytorch_velo import VeLO # learned LSTM optimizer (just a PyTorch wrapper for the JAX optimizer)
from .mctorch import ConjugateGradient # the other optimizers from this lib are for manifolds only, this one works on any layers
from .modded_nanogpt import Kron # Kron fork by evanatyourservice with recent changes
from .smplifyx import TrustRegionNewtonCG, LBFGS # trust region Newton-CG
from .widedeepnetworks import ESS, HMC # Gaussian Process Behaviour in Wide Deep Neural Networks (zeroth order?)
from .alf import NeroPlus, AdamTF
# keeps the norm of each parameter vector fixed and mean at zero during the optimization process
from .SOAP_MUON import SOAP_Muon # SOAP + Muon = SOAP_Muon
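Several of the entries above wrap another optimizer instead of taking parameters directly (ScheduleFreeWrapper, Lookahead, the GCOptimizer continuation wrapper), and the schedule-free family additionally needs explicit mode switches. A sketch of the wrapping pattern using the schedulefree package imported in the first hunk; the momentum keyword is recalled from that project's README and may differ between versions, so treat it as an assumption:

import torch
import schedulefree

model = torch.nn.Linear(8, 2)
base = torch.optim.SGD(model.parameters(), lr=1.0)
opt = schedulefree.ScheduleFreeWrapper(base, momentum=0.9)  # keyword assumed, check the installed version

opt.train()                        # schedule-free methods track two iterate sequences,
for _ in range(5):                 # so the optimizer must know when it is in training mode
    opt.zero_grad()
    loss = model(torch.randn(16, 8)).pow(2).mean()
    loss.backward()
    opt.step()
opt.eval()                         # switch to the averaged iterate before validation or saving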
        
inikishev revised this gist
Dec 20, 2024. 1 changed file with 163 additions and 2 deletions.
@@ -16,6 +16,8 @@
from .moai import * # insane number of them
from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
from .pyutils import Adam_GC, DAdaptAdam, DAdaptSGD, GLD, Lookahead, Prodigy, RAdam, SAM, SGD_GC, SMTP
from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars
# ----------------------------------- repos ---------------------------------- #
from .kron_torch import Kron
@@ -223,7 +225,7 @@
from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper # scipy minimize (ha ha mine is better)
from .geoopt import SGRHMC, RHMC, RSGLD, RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)
from .pykg2vec import RiemannianOptimizer
@@ -287,4 +289,163 @@
# noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)
from .Exponentiated_Gradient import EGPM # exponentiated gradient (EG) algorithm and plus-minus variant
from .zeroptim import MeZO, SmartES # zero-order optimization techniques
from .GDPolyak import GDPolyak # Gradient descent with adaptive stepsize converges (nearly) linearly under fourth-order growth
from .APROX import Truncated, TruncatedAdagrad # APROX: Robust Stochastic Optimization Algorithms
from .SVRG_Pytorch import SVRG # efficient variant of SVRG that relies on mini-batching, implemented in PyTorch
from .poincare_embeddings import RiemannianSGD # actually working Riemannian SGD
from .tram_optimizer import TRAM # Trust Region Aware Minimization
from .gsam import GSAM # Surrogate Gap Guided Sharpness-Aware Minimization
from .ReinventingWheel import FTRLP # FTRL-proximal algorithm (Follow-the-Regularized-Leader and Mirror Descent: Equivalence Theorems and L1 Regularization, H. B. McMahan, AISTATS 2011)
from .OLoptim import FTML, FTRL_Proximal, OSD, SGDOL_global, SGD_globLR, STORM # online & stochastic optimization algorithms for deep learning
from .metaopt import SGD_Multi_LR, SGD_Quotient_LR # Online hyperparameter optimization by real-time recurrent learning
from .core_optimizer import CoRe # Continual Resilient (CoRe) Optimizer
from .Seminar import Ada_Grad, FTRL, nAda_Grad, nFTRL, nKT, nOGD, OGD # "Implementation of different algorithms and their normalized counterparts in the pytorch framework"
from .Recommendation_System_Method_Reproduce import FTRL
from .Code import FTRL, OBC
from .ftrl import FTRL
from .DP_FTRL import FTRLOptimizer # official implementation by Google
# Follow-the-Regularized-Leader
from .smart_grid import AdaX # AdaX: Adaptive Gradient Descent with Exponential Long Term Memory
from .nerf_atlas import UniformAdam # something crazy with solves and a Laplacian matrix??
from .mlopt import Adahessian, Goldstein, Normalized_Optimizer, OneBit_Adam, SAM, Alternate_SAM, Alternate_SAM_v2, Alternate_SAM_v3, AdamS_v1, ASAM_ON, Sketch_Adam, SophiaG, Sophus, GN_DOM_SGD, GN_BULK_SGD, DOM_SGD, BULK_SGD # crazy stuff (no descriptions)
from .subgd import PCAOptimizer # Few-Shot Learning by Dimensionality Reduction in Gradient Space (needs some kind of config)
from .RFR_NeurIPS23 import RFR # robust fairness regularization (RFR) - Chasing Fairness under Distribution Shift: a Model Weight Perturbation Approach
from .A_Deep_Learning_Optimizer_Based_on_Grunwald_Letnikov_Fractional_Order_Definition import FCSGD_G_L, FCAdam_G_L # A Deep Learning Optimizer Based on Grunwald Letnikov Fractional Order Definition
from .VFOGD_PF_and_Its_Application_in_Deep_Learning import VFOSGD_PF, VFOAdam_PF # VFOGD_PF and Its Application in Deep Learning
from .staleness_corrected_momentum import SCMSGD, SCMTDProp, OptChain, FixedSGD # Correcting Momentum in Temporal Difference Learning
from .DPSGD import DPSGD # PyTorch implementation of tf.privacy.DPGradientDescentGaussianOptimizer
from .DPtorch import JDPSGD # Improving Deep Learning with Differential Privacy using Gradient Encoding and Denoising
from .optimizer2 import AdaBound, AdaGC, AdaMod, Adan, Yogi # Adaptive Optimization Algorithm with Gradient Bias Correction (AdaGC)
from .ProxSPS import SPS, ProxAdam # Polyak step sizes with weight decay in PyTorch
from .bb_dl import BB # Barzilai-Borwein-based Adaptive Learning Rate for Deep Learning
from .Adaptive_learning_rate_optimization_algorithm_with_dynamic_bound_based_on_Barzilai_Borwein_method import BBbound, AdaBBbound # Adaptive learning rate optimization algorithm with dynamic bound based on Barzilai-Borwein method
from .mL_BFGS import SlimQN, BlockSlimQN, KFACOptimizer, LBFGSOptimizer, SGDOptimizer # A Momentum-based L-BFGS for Distributed Large-Scale Neural Network Optimization
from .Noisy_SGD import GGDO1, GGDO2, GGDO3, GGDO4, GGDO5, pSGLD, SGLD # Adaptively Preconditioned Stochastic Gradient Langevin Dynamics
from .adamaio import AdamAIO # All-In-One Adam
from .adams import Adams, AdamUCB, AdamCB # Exploiting Uncertainty of Loss Landscape for Stochastic Optimization
from .AdaTS import AdaTS, AdaITS, AdamTS, YOGITS # ???? can't find anything about the "AdaTS algorithm"
from .MNSAM import MNSAM, SAM, SUM, Adan # Sharpness-Aware Minimization Method with Momentum Acceleration for Deep Neural Networks
from .innaprop import INNAprop # INNAprop, a second-order optimization algorithm for deep learning
from .M3Learning import AdaHessian, TRCG # Trust-Region Conjugate Gradient
from .opt_for_pinns import Adam_LBFGS, Adam_LBFGS_GD, Adam_LBFGS_NNCG, ALRMAG, GD, NysNewtonCG, PolyakGD, PolyakLBFGS, SketchyGN, SketchySGD # bunch of stuff from "Challenges in Training PINNs: A Loss Landscape Perspective"
from .deepxde import NNCG # NysNewtonCG, a damped Newton-CG method that uses Nyström preconditioning
from .alternating_layers import DampedNewton # seemingly a good DampedNewton (they apply it to the final layer, the rest is optimized by a first-order method)
from .nanoGPTSLS import AdamSLS, KenSLS # training nanoGPT with SLS
from .Skoltech3D import BatchBFGS, BatchGD # batch BFGS? what's that (these don't inherit Optimizer but have the same signature)
from .ICNN_verification import SdLBFGS, SdLBFGS0 # Stochastic quasi-Newton methods for nonconvex stochastic optimization
from .ZO_AdaMM_vs_FO_AdaMM import AdaMM # ON THE CONVERGENCE OF ADAM AND BEYOND
from .AdaSTE import BayesBiNN, FenBPOpt, FenBPOptQuad, FenBPOptProx, MDTanhOpt # AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule
from .alopex import Alopex # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version
from .statopt import QHM, SALSA, SSLS, SASA, SLOPE # ??? idk
from .superpolyak import SuperPolyak, NewtonCG # SuperPolyak subgradient method - first-order method for solving (possibly) nonsmooth equations/optimization problems
from .GaussNewtonPolyak import GNP # A linearly convergent Gauss-Newton subgradient method for ill-conditioned problems
from .ntd import NTD, Polyak # Normal-Tangent-Descent (a nearly linearly convergent first-order method for nonsmooth functions with quadratic growth)
from .nuqls import LaplaceGGN # Uncertainty Quantification with the Empirical Tangent Kernel
from .SimuRLacra import GSS # Golden Section Search (I think this is gradient free)
from .gcopt import GCOptimizer # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)
from .k_fac import KFACOptimizer, KFACIDOptimizer, SKFACOptimizer, EKFACOptimizer, KBFGSOptimizer, KBFGSLOptimizer, KBFGSL2LOOPOptimizer, KBFGSLMEOptimizer, NGDOptimizer # biggest K-FAC repo (I fixed all acc_stats)
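Many of the additions in this revision (the SLS line searches, NewtonCG, SdLBFGS, Adam_LBFGS, the scipy Minimizer wrappers) are closure-based: their step() re-evaluates the loss one or more times, so it must be passed a closure rather than being called after a single backward(). A minimal sketch of that calling convention using torch.optim.LBFGS as a stand-in; whether the closure should also call backward() varies between the collected implementations:

import torch

model = torch.nn.Linear(8, 2)
X, y = torch.randn(32, 8), torch.randn(32, 2)
opt = torch.optim.LBFGS(model.parameters(), lr=1.0, max_iter=20)  # stand-in for the closure-based entries above

def closure():
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(X), y)
    loss.backward()               # LBFGS expects gradients computed inside the closure
    return loss

for _ in range(5):
    opt.step(closure)             # step() may call the closure several times per iteration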
        
inikishev created this gist
Dec 6, 2024.
@@ -0,0 +1,290 @@
# pylint: disable = reimported
# ruff: noqa: F811
# ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
# from pytorch_optimizer import ...
# from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
# from heavyball import ...
# from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
# from timm.optim import ...
# ----------------------------------- misc ----------------------------------- #
from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
from .PersonalCodeRepository import SVRG, ErrorFeedbackSGD
from .sota_data_augmentation_and_optimizers import RAdam, DeepMemory, Lookahead
from .Awesome_Optimizers import * # insane number of them
from .moai import * # insane number of them
from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
from .pyutils import Adam_GC, DAdaptAdam, DAdaptSGD, GLD, Lookahead, Prodigy, RAdam, SAM, SGD_GC, SMTP
# ----------------------------------- repos ---------------------------------- #
from .kron_torch import Kron # Kronecker-factored preconditioner
from .MEKF_MAME import MEKF, MEKF_MA, Lookahead # Modified Extended Kalman Filter with generalized exponential Moving Average
from .NGD_SGD import NGD # Natural gradient descent
from .psgd_torch import LRA, Affine, Kron, Newton, XMat # Preconditioned gradient descent
from .psiDAG import UniversalSGD # Universal Stochastic Gradient Method
from .RiemannianSGD import HyperboloidRSGD, PoincareRSGD # non-Euclidean space GD
from .StochasticMirrorDescent import SMD_compress, SMD_qnorm # Stochastic Mirror Descent
from .SUG.SUG import SUG # Adaptive stochastic gradient method based on the universal gradient method
from .VTTCG import VTTCG, AdaBelief # Variable three-term conjugate gradient method
from .FAdam import FAdam, AnyPrecisionAdamW # Fisher Adam
from .dfw import DFW # Deep Frank Wolfe
from .coolmomentum import Coolmom, Coolmomentum, Coollin # CoolMomentum: a method for stochastic optimization by Langevin dynamics with simulated annealing
from .bgd import BGD # Bayesian Gradient Descent
from .torchimize import GNA # Gauss-Newton algorithm
from .autosgm import AutoSGM # AutoSGM: A Unifying Framework for Accelerated Learning
from .torch_kfac import KFACOptimizer, EKFACOptimizer
from .KFAC import KFAC, EKFAC, GKFAC
# Kronecker-Factored Approximate Curvature
from .SGDPH.sgdph import sgdph # SGD with Partial Hessian
from .LaplacianSmoothing_GradientDescent import LS_SGD # doesn't work
from .LS_MCMC import LSpSGLD, LSSGLD, pSGLD, SGLD # LSpSGLD, LSSGLD require "vecs", whatever that is
from .DP_LSSGD import LSSGD, LSSGDTorch # doesn't work
from .dlt import LSSGD, LSSGDTorch # doesn't work
# Laplacian Smoothing Gradient Descent
from .adashift import AdaShift # AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods
from .soap import SOAP # Shampoo with Adam in the Preconditioner's eigenbasis (SOAP)
from .PAL import PalOptimizer # PAL - Parabolic Approximation Line Search for DNNs
from .LABPAL import GOLSI, LabPal, PalOptimizer, PLS, Sls, SLS # The Large-Batch Parabolic Approximation Line Search (LABPAL)
from .lion import Lion, LionForEach # LionForEach is not in __init__ by default so idk if it is tested
# EvoLved Sign Momentum (Symbolic Discovery of Optimization Algorithms)
from .adam_atan2 import AdamAtan2, AdamAtan2ForEach, AdamAtan2WithWassersteinReg # only AdamAtan2 is in __init__
# Adam with atan2 instead of epsilon (Scaling Exponents Across Parameterizations and Optimizers)
from .grokfast import GrokFastAdamW # Grokfast, Accelerated Grokking by Amplifying Slow Gradients
from .lbfgs import LBFGSNew, LBFGSB # Improved LBFGS and LBFGS-B optimizers
from .AdEMAMix import AdEMAMix, AdEMAMixDistributedShampoo # The AdEMAMix Optimizer: Better, Faster, Older (mixture of two EMAs)
from .parameterfree import COCOB, KT, cKT # Parameter-Free Optimizers
from .SimulatedAnnealing import SimulatedAnnealing # Simulated Annealing
from .Positive_Negative_Momentum import PNM, AdaPNM # Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization
from .AngularGrad import AdaBelief, diffgrad, cosangulargrad, tanangulargrad # AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks
from .PIDOptimizer import PIDOptimizer # A PID Controller Approach for Stochastic Optimization of Deep Networks
from .esgd import ESGD # stochastic non-convex second-order optimizer
from .pytorch_soo import * # a lot of them
# Second Order Optimizers for Machine Learning
from .curveball import CurveBall, CurveBallInterleave # Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning
from .torch_second_order import GradientDescent, LevenbergMarquardt # Levenberg–Marquardt algorithm
from .grnewt import NewtonSummary, NewtonSummaryVanilla, NewtonSummaryFB, NewtonSummaryUniformAvg # Adapting Newton's Method to Neural Networks through a Summary of Higher-Order Derivatives
from .pytorch_storm import STORM # stochastic first-order trust region method
from .pytorch_trish import TRish # A Stochastic Trust Region Algorithm Based on Careful Step Normalization
from .fate_llm import ZerothOrderOptimizer, KSeedZerothOrderOptimizer # "This optimizer performs a `random` walk update for the parameters of the model."
from .FederatedScope_FedKSeed import MeZOBiasOptimizer
from .fusion_bench import MeZO # MeZO
from .NewtonCG import NewtonCG # Newton-CG algorithm with backtracking line search
from .dreamplace import NesterovAcceleratedGradientOptimizer # Nesterov's implementation of the e-place algorithm (???) (THIS IS NOT NESTEROV MOMENTUM, IT'S NESTEROV SOMETHING ELSE)
from .sls_ffa import Sls, SlsAcc, SlsEg, SVRG, AdaBound, CocobBackprop, CocobOns, PLS # Stochastic line search (fork with more stuff)
from .sps import Sps # Stochastic Polyak Step-size
from .ada_sls import AdaSLS # Adaptive Gradient Methods Converge Faster with Over-Parameterization
from .sls import Sls, SlsAcc, SlsEg # Stochastic line search
from .chop import PGD, PGDMadry, S3CM, PairwiseFrankWolfe, FrankWolfe # constrained optimization for PyTorch
from .ncg_optimizer import LCG, BASIC
from .ncg_optimizer_ApostolosGreece import LCG, BASIC # fork, seems to have some kinds of changes
# nonlinear conjugate gradient
from .LPF_SGD import EntropySGD2, EntropyAdam, EntropySGD, SAM # Low-Pass Filtering SGD for Recovering Flat Optima (but I don't think it has an LPFSGD optimizer, unless EntropySGD is one)
from .optimizer import SAM, NelderMead, PatternSearch # bro made a Nelder-Mead (Loss Landscapes are All You Need: Neural Network Generalization Can Be Explained Without the Implicit Bias of Gradient Descent)
from .convis import FiniteDifferenceGradientOptimizer # apparently second-order finite differences
from .fullbatch import AdaptiveGradientClipping, FISTA, FISTALineSearch, SGDLineSearch, LARS, LBFGS, SAM, SGD_AGC, RestartingLineSearch, NonMonotoneLinesearch, WolfeGradientDescent # Training vision models with full-batch gradient descent and regularization
from .peps_torch_feat_czx import SGD_MOD, LBFGS_MOD # SGD with backtracking line search
from .Target_Based_Surrogates_For_Stochastic_Optimization import Ada_FMDOpt, Adam_FMDOpt, Diag_Ada_FMDOpt, GULF2, LSOpt, MD_FMDOpt, Online_Newton_FMDOpt, Sadagrad, SGD_FMDOpt, SLS_FMDOpt, SVRG # Target Based Surrogates For Stochastic Optimization (some crazy stuff)
from .SDLS import SDLS # Armijo backtracking line search for training DNNs
from .hessianfree import HessianFree # Deep learning via Hessian-free optimization (need to install backpack)
from .salsa.SaLSA import SaLSA # SALSA - Stable Armijo Line Search Adaptation
from .nitorch import OGM, BacktrackingLineSearch # optimizers from a neuroimaging library
from .qori_aziz_sa import SimulatedAnnealing # SA from someone's homework
from .neural_net_optimizers import GeneticAlgorithm, ParticleSwarm # dfo
from .NNAIF import CMAES, EMNA, IMFIL, NNAIF, SGPGD, RESNETEULER # Neural Network Accelerated Implicit Filtering: Integrating Neural Network Surrogates With Provably Convergent Derivative Free Optimization Methods
from .befree import CurveBall, HessianFree, Newton, SimplifiedHessian # On the New method of Hessian-free second-order optimization
from .bayesian_snn import BayesBiSNN, GaussianBayesOptimizer # Bayesian Continual Learning via Spiking Neural Networks (I think it needs layers from that lib too)
from .ML_APTS import APTS, LocalTR, TR, TRAdam # Additively preconditioned trust-region strategies for machine learning
from .torchmin import Minimizer, ScipyMinimizer
from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper # scipy minimize (ha ha mine is better)
from .geoopt import RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)
from .pykg2vec import RiemannianOptimizer # from a "Python library for knowledge graph embedding" (but I changed it to affect all layers)
from .M_FAC import MFAC # M-FAC: Efficient Matrix-Free Approximations of Second-Order Information
from .ddpnopt import Step, RmsDDP, AdamDDP # DDPNOpt: Differential Dynamic Programming Neural Optimizer
from .singd import SINGD # KFAC-like Structured Inverse-Free Natural Gradient Descent
from .sirfshampoo import SIRFShampoo # SIRFShampoo: Structured inverse- and root-free Shampoo in PyTorch
from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)
from .Muon import Muon, AutoMuon # MomentUm Orthogonalized by Newton-Schulz
from .orth_optim import orthogonalise # Orthogonalising gradients to speed up neural network optimisation. `orthogonalise(AdamW)(model.parameters(), lr = 1e-3)`
from .torch_pso import ParticleSwarmOptimizer, GenerationalPSO, AutotuningPSO, RingTopologyPSO, ChaoticPSO, GenericPSO, AcceleratedPSO, SineCosineAlgorithm, ImprovedSineCosineAlgorithm # Particle Swarm Optimization
from .langevin_sampling import SGLD, pSGLD # sampling with gradient-based Markov Chain Monte Carlo approaches
from .adopt import ADOPT # Modified Adam Can Converge with Any β2 with the Optimal Rate
from .fsdp_optimizers import SOAP, Kron, Muon, KronMars # optimizers with FSDP support
from .NGPlus import NGPlus, o_NGPlus, o_NGPlus_Block, create_oNG_optimizer # NG+: A new second-order optimizer for deep learning
from .MARS_AdamW import MarsAdamW # MARS: Unleashing the Power of Variance Reduction for Training Large Models
from .MSAM import AdamW, AdamW_MSAM, AdamW_SAM, ESAM, LookSAM, MSAM, SAM, SGD # Momentum-SAM: Sharpness Aware Minimization without Computational Overhead
from .adasub import SubHes # Stochastic Optimization Using Second-Order Information in Low-Dimensional Subspaces
from .MomSPS import MomSPS, MomSPS_smooth # Stochastic Polyak Step-sizes and Momentum
from .momo import Momo, MomoAdam # Momentum Models for Adaptive Learning Rates
from .DIMAT import CDMSGD, CDSGD, CGA, DSMA, LDSGD, SGP, SwarmSGD # Decentralized Iterative Merging-And-Training for Deep Learning Models
from .Noise_stability_optimization import BSAM, NSM, SAM, RSAM # noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)
from .Exponentiated_Gradient import EGPM # exponentiated gradient (EG) algorithm and plus-minus variant
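Since the module deliberately re-imports many optimizers under clashing names (hence the reimported/F811 suppressions at the top), the natural way to exercise any single entry is a quick smoke test on a toy problem. A sketch of such a harness, assuming only that the class follows the standard torch.optim.Optimizer interface; constructor keywords vary per repository, and some entries (line searches, trust regions, zeroth-order methods) need a closure or extra configuration instead:

import torch

def smoke_test(opt_cls, steps=10, **opt_kwargs):
    # run a few steps of opt_cls on a tiny least-squares problem and return the final loss
    torch.manual_seed(0)
    model = torch.nn.Linear(4, 1)
    X, y = torch.randn(64, 4), torch.randn(64, 1)
    opt = opt_cls(model.parameters(), **opt_kwargs)
    loss = None
    for _ in range(steps):
        opt.zero_grad()
        loss = torch.nn.functional.mse_loss(model(X), y)
        loss.backward()
        opt.step()
    return loss.item()

# e.g. smoke_test(torch.optim.SGD, lr=1e-2); swapping in one of the classes
# imported above is the intended use, with kwargs adjusted per repository.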