
Revisions

  1. inikishev revised this gist Mar 10, 2025. 1 changed file with 96 additions and 2 deletions.
    @@ -252,7 +252,7 @@
    from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer
    # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)

    from .Muon import Muon, AutoMuon
    from .Muon import Muon
    # MomentUm Orthogonalized by Newton-schulz.
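    # a minimal sketch (not this file's code) of the Newton-Schulz orthogonalization Muon applies to each 2D
    # momentum buffer; the quintic coefficients below are the ones from the public reference implementation:
    #   def newton_schulz_orthogonalize(G, steps=5, eps=1e-7):
    #       a, b, c = 3.4445, -4.7750, 2.0315
    #       X = G / (G.norm() + eps)                    # normalize so the iteration converges
    #       if G.size(0) > G.size(1): X = X.T           # iterate on the wide orientation
    #       for _ in range(steps):
    #           A = X @ X.T
    #           X = a * X + (b * A + c * A @ A) @ X     # quintic Newton-Schulz step
    #       return X.T if G.size(0) > G.size(1) else X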

    from .orth_optim import orthogonalise
    @@ -532,4 +532,98 @@
    # keeps the norm of each parameter vector fixed and mean at zero during the optimization process

    from .SOAP_MUON import SOAP_Muon
    # SOAP + Muon = SOAP_Muon

    from .psgd_kron_lra import KronLRA
    # LRA per kron factor

    from .psgd_kron_contraction import Kron
    # joint learning of Xilin Li's criterion 3 as well as Madeleine Udell's contraction factor on the Lie group

    from .Moonlight import Muon
    # Muon with lr normalization based on param size and maybe other stuff

    from .FedPD import PSVRG, PSGD, FedPD_SGD, FedPD_VR
    # Federated Primal-Dual Algorithm

    from .llmart import GreedyCoordinateGradient
    # greedy coordinate gradient

    from .EOPC import Rosen
    # Optimizing Mode Connectivity for Class Incremental Learning

    from .Autoenv import IEKF
    # iterative extended kalman filter optimizer

    from .fastr import FastrD, FastrN, STORMplus, StormPlus
    # Fully Adaptive STochastic Recursive-momentum

    from .NeuralNetwork import SLBI, SLBI2, SLBI_ADAM_ToolBox, SLBI_SGD_ToolBox
    from .DessiLBI import SLBI, SLBI_ToolBox
    # Exploring Structural Sparsity of Deep Networks via Inverse Scale Spaces

    from .dowg import DoWG, CDoWG
    # DoWG Unleashed: An Efficient Universal Parameter-Free Gradient Descent Method

    from .archai import CocobBackprop, CocobOns, Lamb
    # Microsoft's NAS lib

    from .coin_betting import SGDOL, Cocob, Recursive, Regralizer, Scinol2, ONSBet
    # Parameter-free coin betting optimizers

    from .dolphinflow import DolphinFlow
    # recent Muon/AdamW-like optimizer with a bunch of settings to tune: https://github.com/cognitivecomputations/dolphinflow-optimizer

    from .neosr import adamw_win, adan_sf, adamw_sf, adan, soap_sf, fsam
    # from a super-resolution lib, plus stuff adapted from heavyball

    from .recpre import SOAP, LionW, SophiaG, Lilith, ELLISAdam, IVON, ZeroShampooWithAdamGraftingOptimizer, OrthogonalNesterov
    # recurrent pretraining

    from .supertrainer2k import Adalite, Lilith
    # idk

    from .wu_nature_comms_2024 import NewStyleBatchFISTAOptim, NewStyleSingleFISTAOptim
    # something insane

    from .dd4ml import APTS,APTS_D, TrustRegion, TrustRegionLegacy
    # Additively preconditioned trust-region strategies for machine learning. Requires some type of config and some type of subdomain_optimizer

    from .koaning_io_more_descent_less_gradient import KeepStepping, KeepVaulting
    # keeps stepping on a single batch, or maybe it was supposed to be a line search, idk

    from .CR import COMP
    # Compact representations for recursive Hessian matrix estimates (similar to LBFGS)

    from .MaxFactor import MaxFactor
    # ultra recent

    from .scion import Scion
    # Training Deep Learning Models with Norm-Constrained LMOs.

    from .rapp import RAPPsgd, RAPPadam, ExtraAdagrad, ExtraAdam, ExtraSGD, EGplusAdam, EGplusSGD, LA, AdamLA, ExtraSGDLA, ExtraAdamLA, EGplusLA, EGplusAdamLA
    # Stable Nonconvex-Nonconcave Training via Linear Interpolation

    from .storm_plus import STORMplus
    # STORM+

    from .AccSGD import AccSGD
    # On the insufficiency of existing momentum schemes for Stochastic Optimization

    from .AdaInject import AdaBelief, AdaBeliefInject, AdamInject, diffGrad, diffGradInject,Radam, RadamInject
    # AdaInject: Injection Based Adaptive Gradient Descent Optimizers for Convolutional Neural Networks

    from .PowerSign_and_AddSign import AddSign, PowerSign
    # https://github.com/Neoanarika/Implementing-the-PowerSign-and-AddSign-rule
    from .AddSign_PowerSign_in_PyTorch import AddSign, PowerSign, LinearInternalDecay, CosineInternalDecay, RestartCosineInternalDecay
    # https://github.com/cydonia999/AddSign_PowerSign_in_PyTorch
    # Neural Optimizer Search with Reinforcement Learning

    from .neumann_optimizer import Neumann, Neumann2
    # https://github.com/jayybhatt/neumann-optimizer
    # A Practical Optimization Algorithm for Deep Neural Networks (implicitly computes the inverse Hessian of each mini-batch to produce descent directions)

    from .neural_search_optimizer import Optimizer_1
    # https://github.com/daviddao/pytorch-neural-search-optimizer
    # Neural Optimizer Search's Optimizer_1

  2. inikishev revised this gist Feb 10, 2025. 1 changed file with 92 additions and 8 deletions.
    @@ -2,11 +2,14 @@
    # ruff: noqa: F811

    # ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
    # from pytorch_optimizer import ...
    # from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    # from heavyball import ...
    # from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    # from timm.optim import ...
    from pytorch_optimizer import ADOPT, AdaBelief #, ...
    from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    from heavyball import ForeachSFAdamW, PaLMForeachSFAdamW, ForeachADOPT, ForeachMuon, ForeachLaProp, MuonLaProp, ForeachSOAP, PaLMForeachSOAP, PrecondScheduleForeachSOAP, PrecondSchedulePaLMForeachSOAP, ForeachPSGDKron, ForeachPurePSGD, ForeachCachedDelayedPSGDKron, ForeachCachedPSGDKron, ForeachDelayedPSGD
    from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    from timm.optim import AdaBelief, Adafactor #, ...
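    # hedged usage sketch for schedulefree (pattern from its README; exact kwargs may differ): wrap a base
    # optimizer and toggle train()/eval() around training vs. validation/checkpointing:
    #   opt = ScheduleFreeWrapper(torch.optim.SGD(model.parameters(), lr=1e-3), momentum=0.9)
    #   opt.train()                                  # required before training steps
    #   loss.backward(); opt.step(); opt.zero_grad()
    #   opt.eval()                                   # required before eval / saving a checkpoint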

    # ----------------------------------- MINE ----------------------------------- #
    from torchzero.optim import Adagrad, AdamW #, ...

    # ----------------------------------- misc ----------------------------------- #
    from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
    @@ -18,6 +21,8 @@
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP
    from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
    from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars
    from .cringe_live import AdaAbs, AdaptiveCompass, Clybius, Compass, DOPE, ExMachina, FARMSCropV3, FCompass, SAVEUS
    from .Personalized_Optimizers import FARMSCrop, FARMSCropV2, FCompass, FishMonger, FMARSCrop, FMARSCrop_ExMachina, FMARSCropV2

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    @@ -67,6 +72,7 @@

    from .torch_kfac import KFACOptimizer, EKFACOptimizer
    from .KFAC import KFAC, EKFAC, GKFAC
    from .torch_kfac2 import KFAC # MAYBE GOOD
    # Kronecker-Factored Approximate Curvature

    from .SGDPH.sgdph import sgdph
    @@ -424,7 +430,7 @@
    # AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule

    from .alopex import Alopex
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version (gradient free)

    from .statopt import QHM, SALSA, SSLS, SASA, SLOPE
    # ???idk
    @@ -442,10 +448,88 @@
    # Uncertainty Quantification with the Empirical Tangent Kernel

    from .SimuRLacra import GSS
    # Golden Section Search (I think this is gradient free)
    # Golden Section Search (I think this is gradient free and for univariate funcs)

    from .gcopt import GCOptimizer
    # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)

    from .k_fac import KFACOptimizer,KFACIDOptimizer,SKFACOptimizer,EKFACOptimizer,KBFGSOptimizer,KBFGSLOptimizer,KBFGSL2LOOPOptimizer,KBFGSLMEOptimizer,NGDOptimizer
    # biggest k-fac repo (i fixed all acc_stats)

    from .proxyprox import ProxyProx
    # konstmish's mysterious ProxyProx (has step as well as inner_step methods)

    from .SWANOptimizer import SWAN
    # SWAN (SGD with Whitening And Normalization)

    from .sparse_szo import DuelingEvolutionOptimizer, VanillaEvolutionOptimizer, OneSideEvolutionOptimizer, TwoSideEvolutionOptimizer, FirstOrderOptimizer, FirstOrderBanditOptimizer
    # Sparse Perturbations for Improved Convergence in Stochastic Zeroth-Order Optimization

    from .PSGD_Nuon import Nuon, AutoNuon
    # uses single-sided whitening that is dynamic and learned, instead of instantaneous like Muon's

    from .coherent_gradients import RA3,RM3, M3
    # Weak and Strong Gradient Directions: Explaining Memorization, Generalization, and Hardness of Examples at Scale

    from .eva import Eva, EvaExperimental, KFAC, AdaKFAC, AdaKFAC2, KFACSAM, MFAC, Shampoo
    # Eva: Practical Second-order Optimization with Kronecker-vectorized Approximation (pretty sure they modify the gradient and don't update params)

    from .natural_galore import SubSpaceAdamW
    # GaLore extension - Natural Gradient Descent in low rank subspace

    from .galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit
    # Memory-Efficient LLM Training by Gradient Low-Rank Projection

    from .compass_optimizer import CompassExperimental4Bit, CompassExperimental8Bit, CompassExperimental8BitBNB, Compasstic, LPFAdamW, AdamW, RMSProp
    # A modification of the original AdamW optimizer, replacing the momentum moment with a smoothing filter.

    from .sgd_sai import SGD_sai
    # No More Adam: Learning Rate Scaling at Initialization is All You Need

    from .unrl import EligibilityTraceOptimizer,KFACOptimizer
    # optimizers from Reinforcement Learning algorithms library

    from .second_order_optimization_NQS import SecondOrderOpt
    # Second-order Optimisation strategies for neural network quantum states

    from .ldadamw_torch import LDAdamW
    # Low-Dimensional Adam - Adaptive Optimization from Low-Dimensional Gradient Statistics.

    from .pydrsom import DRSOMB, DRSOMB2, DRSOMK, DRSOMVec, KDRSOM
    # dimension-reduced second-order method (DRSOM)

    from .AdaGL import AdaGL, FractionalSmoothLoss
    # deep learning optimizer that combines fractional-order calculus with adaptive techniques. Using Grünwald–Letnikov derivatives

    from .mkor import MKOR
    # Momentum-Enabled Kronecker-Factor-Based Optimizer Using Rank-1 Updates

    from .sn_sm import GenericOptim, AdamWSN, AdamWSNG
    # GenericOptim is maybe the main one; Subset-Norm and Subspace-Momentum: Faster Memory-Efficient Adaptive Optimization with Convergence Guarantees

    from .OptML_Project import Adasub, Adahessian
    # Comparison of second-order optimizers on transformers

    from .MARS import MARS, ADOPT, Muon, AdamW
    # MARS (Make vAriance Reduction Shine)

    from .pytorch_velo import VeLO
    # learned LSTM optimizer (just a PyTorch wrapper for the JAX optimizer)

    from .mctorch import ConjugateGradient
    # the other optimizers in this package are for manifolds only; this one works on any layers

    from .modded_nanogpt import Kron
    # kron fork by evanatyourservice with recent changes

    from .smplifyx import TrustRegionNewtonCG, LBFGS
    # trust region newton cg

    from .widedeepnetworks import ESS, HMC
    # Gaussian Process Behaviour in Wide Deep Neural Networks (zeroth order?)

    from .alf import NeroPlus, AdamTF
    # keeps the norm of each parameter vector fixed and mean at zero during the optimization process

    from .SOAP_MUON import SOAP_Muon
    # SOAP + Muon = SOAP_Muon
  3. inikishev revised this gist Dec 20, 2024. 1 changed file with 163 additions and 2 deletions.
    @@ -16,6 +16,8 @@
    from .moai import * # insane number of them
    from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP
    from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
    from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    @@ -223,7 +225,7 @@
    from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper
    # scipy minimize (ha ha mine is better)

    from .geoopt import RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    from .geoopt import SGRHMC, RHMC, RSGLD, RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)
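    # hedged sketch of typical geoopt usage (an assumption, check the geoopt docs): the Riemannian optimizers
    # act on geoopt.ManifoldParameter and treat ordinary tensors as Euclidean:
    #   import torch, geoopt
    #   emb = geoopt.ManifoldParameter(torch.randn(100, 2) * 1e-3, manifold=geoopt.PoincareBall())
    #   opt = RiemannianAdam([emb], lr=1e-2)
    #   emb.norm().backward(); opt.step(); opt.zero_grad()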

    from .pykg2vec import RiemannianOptimizer
    @@ -287,4 +289,163 @@
    # noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)

    from .Exponentiated_Gradient import EGPM
    # exponentiated gradient (EG) algorithm and plus-minus variant

    from .zeroptim import MeZO, SmartES
    # zero-order optimization techniques
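    # illustrative sketch of the MeZO-style two-point gradient estimate that zeroth-order methods like these
    # build on (params/closure/lr/eps are placeholders, not this repo's API):
    #   z = [torch.randn_like(p) for p in params]
    #   for p, zi in zip(params, z): p.data.add_(eps * zi)
    #   loss_plus = closure()
    #   for p, zi in zip(params, z): p.data.sub_(2 * eps * zi)
    #   loss_minus = closure()
    #   for p, zi in zip(params, z): p.data.add_(eps * zi)              # restore original params
    #   g = (loss_plus - loss_minus) / (2 * eps)                        # projected gradient estimate
    #   for p, zi in zip(params, z): p.data.sub_(lr * g * zi)           # SGD step along the probe direction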

    from .GDPolyak import GDPolyak
    # Gradient descent with adaptive stepsize converges (nearly) linearly under fourth-order growth

    from .APROX import Truncated, TruncatedAdagrad
    # APROX: Robust Stochastic Optimization Algorithms

    from .SVRG_Pytorch import SVRG
    # efficient variant of SVRG that relies on mini-batching implemented in Pytorch

    from .poincare_embeddings import RiemannianSGD
    # actually working riemannian SGD

    from .tram_optimizer import TRAM
    # Trust Region Aware Minimization

    from .gsam import GSAM
    # Surrogate Gap Guided Sharpness-Aware Minimization

    from .ReinventingWheel import FTRLP
    # FTRL-proximal algorithm (Follow-the-Regularized-Leader and Mirror Descent: Equivalence Theorems and L1 Regularization, H. B. Mcmahan. AISTATS 2011.)

    from .OLoptim import FTML, FTRL_Proximal, OSD, SGDOL_global, SGD_globLR, STORM
    # Online & Stochastic optimization algorithms for deep learning

    from .metaopt import SGD_Multi_LR, SGD_Quotient_LR
    # Online hyperparameter optimization by real-time recurrent learning

    from .core_optimizer import CoRe
    # Continual Resilient (CoRe) Optimizer

    from .Seminar import Ada_Grad, FTRL, nAda_Grad, nFTRL, nKT, nOGD, OGD
    # "Implementation of different algorithms and their normalized counterparts in the pytorch framework"

    from .Recommendation_System_Method_Reproduce import FTRL
    from .Code import FTRL, OBC
    from .ftrl import FTRL
    from .DP_FTRL import FTRLOptimizer # official implementation by Google
    # Follow-the-Regularized-Leader

    from .smart_grid import AdaX
    # AdaX: Adaptive Gradient Descent with Exponential Long Term Memory

    from .nerf_atlas import UniformAdam
    # something crazy with solves and a Laplacian matrix??

    from .mlopt import Adahessian, Goldstein, Normalized_Optimizer, OneBit_Adam, SAM, Alternate_SAM, Alternate_SAM_v2, Alternate_SAM_v3, AdamS_v1, ASAM_ON, Sketch_Adam, SophiaG, Sophus, GN_DOM_SGD, GN_BULK_SGD, DOM_SGD, BULK_SGD
    # crazy stuff (no descriptions)

    from .subgd import PCAOptimizer
    # Few-Shot Learning by Dimensionality Reduction in Gradient Space (needs some kind of config)

    from .RFR_NeurIPS23 import RFR
    # robust fairness regularization (RFR) - Chasing Fairness under Distribution Shift: a Model Weight Perturbation Approach

    from .A_Deep_Learning_Optimizer_Based_on_Grunwald_Letnikov_Fractional_Order_Definition import FCSGD_G_L, FCAdam_G_L
    # A Deep Learning Optimizer Based on Grunwald Letnikov Fractional Order Definition

    from .VFOGD_PF_and_Its_Application_in_Deep_Learning import VFOSGD_PF, VFOAdam_PF
    # VFOGD_PF and Its Application in Deep Learning

    from .staleness_corrected_momentum import SCMSGD, SCMTDProp, OptChain, FixedSGD
    # Correcting Momentum in Temporal Difference Learning

    from .DPSGD import DPSGD
    # PyTorch implementation of tf.privacy.DPGradientDescentGaussianOptimizer

    from .DPtorch import JDPSGD
    # Improving Deep Learning with Differential Privacy using Gradient Encoding and Denoising

    from .optimizer2 import AdaBound, AdaGC, AdaMod, Adan, Yogi
    # Adaptive Optimization Algorithm with Gradient Bias Correction (AdaGC)

    from .ProxSPS import SPS, ProxAdam
    # Polyak step sizes with weight decay in Pytorch

    from .bb_dl import BB
    # Barzilai-Borwein-based Adaptive Learning Rate for Deep Learning

    from .Adaptive_learning_rate_optimization_algorithm_with_dynamic_bound_based_on_Barzilai_Borwein_method import BBbound, AdaBBbound
    # Adaptive learning rate optimization algorithm with dynamic bound based on Barzilai-Borwein method

    from .mL_BFGS import SlimQN, BlockSlimQN, KFACOptimizer, LBFGSOptimizer, SGDOptimizer
    # A Momentum-based L-BFGS for Distributed Large-Scale Neural Network Optimization

    from .Noisy_SGD import GGDO1, GGDO2, GGDO3, GGDO4, GGDO5, pSGLD,SGLD
    # Adaptively Preconditioned Stochastic Gradient Langevin Dynamics

    from .adamaio import AdamAIO
    # All-In-One Adam

    from .adams import Adams, AdamUCB, AdamCB
    # Exploiting Uncertainty of Loss Landscape for Stochastic Optimization

    from .AdaTS import AdaTS, AdaITS, AdamTS, YOGITS
    # ???? can't find anything about "AdaTS algorithm"

    from .MNSAM import MNSAM, SAM, SUM, Adan
    # Sharpness-Aware Minimization Method with Momentum Acceleration for Deep Neural Networks

    from .innaprop import INNAprop
    # INNAprop, a second-order optimization algorithm for deep learning

    from .M3Learning import AdaHessian, TRCG
    # Trust-Region Conjugate Gradient

    from .opt_for_pinns import Adam_LBFGS, Adam_LBFGS_GD, Adam_LBFGS_NNCG, ALRMAG, GD, NysNewtonCG, PolyakGD, PolyakLBFGS, SketchyGN, SketchySGD
    # bunch of stuff from "Challenges in Training PINNs: A Loss Landscape Perspective"

    from .deepxde import NNCG
    # NysNewtonCG, a damped Newton-CG method that uses Nyström preconditioning

    from .alternating_layers import DampedNewton
    # seemingly a good DampedNewton (they apply it to the final layer; the rest is optimized by a 1st-order method)

    from .nanoGPTSLS import AdamSLS, KenSLS
    # training nanoGPT with SLS

    from .Skoltech3D import BatchBFGS, BatchGD
    # batch BFGS? what's that (these don't inherit Optimizer but have the same signature)

    from .ICNN_verification import SdLBFGS, SdLBFGS0
    # Stochastic quasi-Newton methods for nonconvex stochastic optimization

    from .ZO_AdaMM_vs_FO_AdaMM import AdaMM
    # ON THE CONVERGENCE OF ADAM AND BEYOND

    from .AdaSTE import BayesBiNN, FenBPOpt, FenBPOptQuad, FenBPOptProx, MDTanhOpt
    # AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule

    from .alopex import Alopex
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version

    from .statopt import QHM, SALSA, SSLS, SASA, SLOPE
    # ???idk

    from .superpolyak import SuperPolyak, NewtonCG
    # SuperPolyak subgradient method - first-order method for solving (possibly) nonsmooth equations/optimization problems

    from .GaussNewtonPolyak import GNP
    # A linearly convergent Gauss-Newton subgradient method for ill-conditioned problems

    from .ntd import NTD, Polyak
    # Normal-Tangent-Descent (A nearly linearly convergent first-order method for nonsmooth functions with quadratic growth)

    from .nuqls import LaplaceGGN
    # Uncertainty Quantification with the Empirical Tangent Kernel

    from .SimuRLacra import GSS
    # Golden Section Search (I think this is gradient free)

    from .gcopt import GCOptimizer
    # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)

    from .k_fac import KFACOptimizer,KFACIDOptimizer,SKFACOptimizer,EKFACOptimizer,KBFGSOptimizer,KBFGSLOptimizer,KBFGSL2LOOPOptimizer,KBFGSLMEOptimizer,NGDOptimizer
    # biggest k-fac repo (i fixed all acc_stats)
  4. inikishev created this gist Dec 6, 2024.
    @@ -0,0 +1,290 @@
    # pylint: disable = reimported
    # ruff: noqa: F811

    # ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
    # from pytorch_optimizer import ...
    # from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    # from heavyball import ...
    # from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    # from timm.optim import ...
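    # most of the optimizers below are drop-in torch.optim.Optimizer subclasses, so the usual loop applies
    # (a generic sketch, SomeOptimizer is a placeholder; line-search / second-order ones typically need step(closure) instead):
    #   opt = SomeOptimizer(model.parameters(), lr=1e-3)
    #   for x, y in loader:
    #       opt.zero_grad()
    #       loss = criterion(model(x), y)
    #       loss.backward()
    #       opt.step()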

    # ----------------------------------- misc ----------------------------------- #
    from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
    from .PersonalCodeRepository import SVRG, ErrorFeedbackSGD
    from .sota_data_augmentation_and_optimizers import RAdam, DeepMemory, Lookahead
    from .Awesome_Optimizers import * # insane number of them
    from .moai import * # insane number of them
    from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    # Kronecker-factored preconditioner

    from .MEKF_MAME import MEKF, MEKF_MA, Lookahead
    # Modified Extended Kalman Filter with generalized exponential Moving Average

    from .NGD_SGD import NGD
    # Natural gradient descent

    from .psgd_torch import LRA, Affine, Kron, Newton, XMat
    # Preconditioned stochastic gradient descent (PSGD)

    from .psiDAG import UniversalSGD
    # Universal Stochastic Gradient Method

    from .RiemannianSGD import HyperboloidRSGD, PoincareRSGD
    # Non-Euclidean space GD

    from .StochasticMirrorDescent import SMD_compress, SMD_qnorm
    # Stochastic Mirror Descent

    from .SUG.SUG import SUG
    # Adaptive stochastic gradient method based on the universal gradient method

    from .VTTCG import VTTCG, AdaBelief
    # Variable three-term conjugate gradient method

    from .FAdam import FAdam, AnyPrecisionAdamW
    # Fisher Adam

    from .dfw import DFW
    # Deep Frank Wolfe

    from .coolmomentum import Coolmom, Coolmomentum, Coollin
    # CoolMomentum: a method for stochastic optimization by Langevin dynamics with simulated annealing

    from .bgd import BGD
    # Bayesian Gradient Descent

    from .torchimize import GNA
    # Gauss-Newton algorithm

    from .autosgm import AutoSGM
    # AutoSGM: A Unifying Framework for Accelerated Learning

    from .torch_kfac import KFACOptimizer, EKFACOptimizer
    from .KFAC import KFAC, EKFAC, GKFAC
    # Kronecker-Factored Approximate Curvature

    from .SGDPH.sgdph import sgdph
    # SGD with Partial Hessian

    from .LaplacianSmoothing_GradientDescent import LS_SGD # doesn't work
    from .LS_MCMC import LSpSGLD, LSSGLD, pSGLD, SGLD # LSpSGLD, LSSGLD require "vecs", whatever that is
    from .DP_LSSGD import LSSGD, LSSGDTorch # don't work
    from .dlt import LSSGD, LSSGDTorch # don't work
    # Laplacian Smoothing Gradient Descent

    from .adashift import AdaShift
    # AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods

    from .soap import SOAP
    # Shampoo with Adam in the Preconditioner's eigenbasis (SOAP).

    from .PAL import PalOptimizer
    # PAL - Parabolic Approximation Line Search for DNNs

    from .LABPAL import GOLSI, LabPal, PalOptimizer, PLS, Sls, SLS
    # The Large-Batch Parabolic Approximation Line Search (LABPAL)

    from .lion import Lion, LionForEach # LionForEach is not in __init__ by default so idk if it is tested
    # EvoLved Sign Momentum (Symbolic Discovery of Optimization Algorithms)
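    # illustrative update (per the paper, not this file's code):
    #   update = sign(beta1 * m + (1 - beta1) * g);  p -= lr * (update + wd * p);  m = beta2 * m + (1 - beta2) * g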

    from .adam_atan2 import AdamAtan2, AdamAtan2ForEach, AdamAtan2WithWassersteinReg # Only AdamAtan2 in __init__
    # Adam with atan2 instead of epsilon (Scaling Exponents Across Parameterizations and Optimizers)

    from .grokfast import GrokFastAdamW
    # Grokfast, Accelerated Grokking by Amplifying Slow Gradients
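    # illustrative sketch of the Grokfast idea (an EMA filter on the gradient before the wrapped AdamW step;
    # variable names are mine): ema = alpha * ema + (1 - alpha) * grad;  grad = grad + lamb * ema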

    from .lbfgs import LBFGSNew, LBFGSB
    # Improved LBFGS and LBFGS-B optimizers.
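    # these follow the closure-based interface of torch.optim.LBFGS (an assumption for LBFGSNew/LBFGSB; kwargs
    # below may differ), a minimal sketch of the pattern:
    #   opt = LBFGSNew(model.parameters(), history_size=7, max_iter=2)
    #   def closure():
    #       opt.zero_grad()
    #       loss = criterion(model(x), y)
    #       loss.backward()
    #       return loss
    #   opt.step(closure)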

    from .AdEMAMix import AdEMAMix, AdEMAMixDistributedShampoo
    # The AdEMAMix Optimizer: Better, Faster, Older (mixture of two EMAs)

    from .parameterfree import COCOB, KT, cKT
    # Parameter-Free Optimizers

    from .SimulatedAnnealing import SimulatedAnnealing
    # Simulated Annealing

    from .Positive_Negative_Momentum import PNM, AdaPNM
    # Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization

    from .AngularGrad import AdaBelief, diffgrad, cosangulargrad, tanangulargrad
    # AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks

    from .PIDOptimizer import PIDOptimizer
    # A PID Controller Approach for Stochastic Optimization of Deep Networks

    from .esgd import ESGD
    # Stochastic non-convex second order optimizer

    from .pytorch_soo import * # a lot of them
    # Second Order Optimizers for Machine Learning

    from .curveball import CurveBall, CurveBallInterleave
    # Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning

    from .torch_second_order import GradientDescent, LevenbergMarquardt
    # Levenberg–Marquardt algorithm

    from .grnewt import NewtonSummary, NewtonSummaryVanilla, NewtonSummaryFB, NewtonSummaryUniformAvg
    # Adapting Newton's Method to Neural Networks through a Summary of Higher-Order Derivatives

    from .pytorch_storm import STORM
    # stochastic first order trust region method

    from .pytorch_trish import TRish
    # A Stochastic Trust Region Algorithm Based on Careful Step Normalization

    from .fate_llm import ZerothOrderOptimizer, KSeedZerothOrderOptimizer
    # "This optimizer performs a `random` walk update for the parameters of the model."

    from .FederatedScope_FedKSeed import MeZOBiasOptimizer
    from .fusion_bench import MeZO
    # MEZO

    from .NewtonCG import NewtonCG
    # Newton-CG algorithm with backtracking line-search

    from .dreamplace import NesterovAcceleratedGradientOptimizer
    # Nesterov's implementation of the e-place algorithm (???) (THIS IS NOT NESTEROV MOMENTUM, IT'S NESTEROV SOMETHING ELSE)

    from .sls_ffa import Sls, SlsAcc, SlsEg, SVRG, AdaBound, CocobBackprop, CocobOns, PLS
    # Stochastic line search (fork with more stuff)

    from .sps import Sps
    # Stochastic Polyak Step-size

    from .ada_sls import AdaSLS
    # Adaptive Gradient Methods Converge Faster with Over-Parameterization

    from .sls import Sls, SlsAcc, SlsEg
    # Stochastic line search

    from .chop import PGD, PGDMadry, S3CM, PairwiseFrankWolfe, FrankWolfe
    # constrained optimization for PyTorch

    from .ncg_optimizer import LCG, BASIC
    from .ncg_optimizer_ApostolosGreece import LCG, BASIC # fork, seems to have some kind of changes
    # nonlinear conjugate gradient

    from .LPF_SGD import EntropySGD2, EntropyAdam, EntropySGD, SAM
    # Low-Pass Filtering SGD for Recovering Flat Optima (but I don't think it has LPFSGD optimizer, unless EntropySGD is one)

    from .optimizer import SAM, NelderMead, PatternSearch
    # bro made a nelder mead (Loss Landscapes are All You Need: Neural Network Generalization Can Be Explained Without the Implicit Bias of Gradient Descent)

    from .convis import FiniteDifferenceGradientOptimizer
    # apparently second-order finite differences

    from .fullbatch import AdaptiveGradientClipping, FISTA, FISTALineSearch, SGDLineSearch, LARS, LBFGS, SAM, SGD_AGC, RestartingLineSearch, NonMonotoneLinesearch, WolfeGradientDescent
    # Training vision models with full-batch gradient descent and regularization

    from .peps_torch_feat_czx import SGD_MOD, LBFGS_MOD
    # SGD with backtracking line search

    from .Target_Based_Surrogates_For_Stochastic_Optimization import Ada_FMDOpt, Adam_FMDOpt, Diag_Ada_FMDOpt, GULF2, LSOpt, MD_FMDOpt, Online_Newton_FMDOpt, Sadagrad, SGD_FMDOpt, SLS_FMDOpt, SVRG
    # Target Based Surrogates For Stochastic Optimization (some crazy stuff)

    from .SDLS import SDLS
    # Armijo Back-tracking line search on Training DNN

    from .hessianfree import HessianFree
    # Deep learning via Hessian-free optimization (need to install backpack)

    from .salsa.SaLSA import SaLSA
    # SALSA - Stable Armijo Line Search Adaptation

    from .nitorch import OGM, BacktrackingLineSearch
    # optimizers from neuroimaging library

    from .qori_aziz_sa import SimulatedAnnealing
    # SA from someone's homework

    from .neural_net_optimizers import GeneticAlgorithm, ParticleSwarm
    # DFO (derivative-free optimization)

    from .NNAIF import CMAES, EMNA, IMFIL, NNAIF, SGPGD, RESNETEULER
    # Neural Network Accelerated Implicit Filtering: Integrating Neural Network Surrogates With Provably Convergent Derivative Free Optimization Methods

    from .befree import CurveBall, HessianFree, Newton, SimplifiedHessian
    # On the New method of Hessian-free second-order optimization

    from .bayesian_snn import BayesBiSNN, GaussianBayesOptimizer
    # Bayesian Continual Learning via Spiking Neural Networks (I think it needs layers from that lib too)

    from .ML_APTS import APTS, LocalTR, TR, TRAdam
    # Additively preconditioned trust-region strategies for machine learning

    from .torchmin import Minimizer, ScipyMinimizer
    from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper
    # scipy minimize (ha ha mine is better)

    from .geoopt import RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)

    from .pykg2vec import RiemannianOptimizer
    # from "Python library for knowledge graph embedding" (but I changed it to affect all layers)

    from .M_FAC import MFAC
    # M-FAC: Efficient Matrix-Free Approximations of Second-Order Information

    from .ddpnopt import Step, RmsDDP, AdamDDP
    # DDPNOpt: Differential Dynamic Programming Neural Optimizer

    from .singd import SINGD
    # KFAC-like Structured Inverse-Free Natural Gradient Descent

    from .sirfshampoo import SIRFShampoo
    # SIRFShampoo: Structured inverse- and root-free Shampoo in PyTorch

    from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer
    # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)

    from .Muon import Muon, AutoMuon
    # MomentUm Orthogonalized by Newton-schulz.

    from .orth_optim import orthogonalise
    # Orthogonalising gradients to speed up neural network optimisation. `orthogonalise(AdamW)(model.parameters(), lr = 1e-3)`

    from .torch_pso import ParticleSwarmOptimizer, GenerationalPSO,AutotuningPSO,RingTopologyPSO,ChaoticPSO,GenericPSO,AcceleratedPSO,SineCosineAlgorithm,ImprovedSineCosineAlgorithm
    # Particle Swarm Optimization
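    # gradient-free, so step() takes a closure that just returns the loss (a hedged sketch; constructor kwargs
    # such as num_particles are assumptions, check the torch-pso README):
    #   opt = ParticleSwarmOptimizer(model.parameters(), num_particles=50)
    #   opt.step(lambda: criterion(model(x), y))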

    from .langevin_sampling import SGLD, pSGLD
    # Sampling with gradient-based Markov Chain Monte Carlo approaches

    from .adopt import ADOPT
    # Modified Adam Can Converge with Any β2 with the Optimal Rate

    from .fsdp_optimizers import SOAP, Kron, Muon, KronMars
    # optimizers with FSDP support

    from .NGPlus import NGPlus, o_NGPlus, o_NGPlus_Block, create_oNG_optimizer
    # NG+: A new second-order optimizer for deep learning

    from .MARS_AdamW import MarsAdamW
    # MARS: Unleashing the Power of Variance Reduction for Training Large Models

    from .MSAM import AdamW, AdamW_MSAM, AdamW_SAM, ESAM, LookSAM, MSAM, SAM, SGD
    # Momentum-SAM: Sharpness Aware Minimization without Computational Overhead
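    # hedged sketch of the common two-step SAM interface (davda54-style; an assumption that these variants
    # mirror it, check each class - MSAM itself is designed to avoid the second forward/backward pass):
    #   opt = SAM(model.parameters(), torch.optim.SGD, rho=0.05, lr=0.1, momentum=0.9)
    #   criterion(model(x), y).backward()
    #   opt.first_step(zero_grad=True)           # ascend to the perturbed weights
    #   criterion(model(x), y).backward()        # second forward/backward at the perturbed point
    #   opt.second_step(zero_grad=True)          # descend using the perturbed gradient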

    from .adasub import SubHes
    # Stochastic Optimization Using Second-Order Information in Low-Dimensional Subspaces

    from .MomSPS import MomSPS, MomSPS_smooth
    # Stochastic Polyak Step-sizes and Momentum

    from .momo import Momo, MomoAdam
    # Momentum Models for Adaptive Learning Rates

    from .DIMAT import CDMSGD, CDSGD, CGA, DSMA, LDSGD, SGP, SwarmSGD
    # Decentralized Iterative Merging-And-Training for Deep Learning Models

    from .Noise_stability_optimization import BSAM, NSM, SAM, RSAM
    # noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)

    from .Exponentiated_Gradient import EGPM
    # exponentiated gradient (EG) algorithm and plus-minus variant