
Revisions

  1. inikishev revised this gist Mar 10, 2025. 1 changed file with 96 additions and 2 deletions.
    @@ -252,7 +252,7 @@
    from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer
    # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)

    from .Muon import Muon, AutoMuon
    from .Muon import Muon
    # MomentUm Orthogonalized by Newton-schulz.
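    # a minimal sketch (not this file's code) of the Newton-Schulz orthogonalization Muon applies to each 2D
    # momentum buffer; the quintic coefficients below are the ones from the public reference implementation:
    #   def newton_schulz_orthogonalize(G, steps=5, eps=1e-7):
    #       a, b, c = 3.4445, -4.7750, 2.0315
    #       X = G / (G.norm() + eps)                    # normalize so the iteration converges
    #       if G.size(0) > G.size(1): X = X.T           # iterate on the wide orientation
    #       for _ in range(steps):
    #           A = X @ X.T
    #           X = a * X + (b * A + c * A @ A) @ X     # quintic Newton-Schulz step
    #       return X.T if G.size(0) > G.size(1) else X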

    from .orth_optim import orthogonalise
    @@ -532,4 +532,98 @@
    # keeps the norm of each parameter vector fixed and mean at zero during the optimization process

    from .SOAP_MUON import SOAP_Muon
    # SOAP + Muon = SOAP_Muon

    from .psgd_kron_lra import KronLRA
    # LRA per kron factor

    from .psgd_kron_contraction import Kron
    # joint learning of Xilin Li's criterion 3 as well as Madeleine Udell's contraction factor on the Lie group

    from .Moonlight import Muon
    # Muon with lr normalization based on param size and maybe other stuff

    from .FedPD import PSVRG, PSGD, FedPD_SGD, FedPD_VR
    # Federated Primal-Dual Algorithm

    from .llmart import GreedyCoordinateGradient
    # greedy coordinate gradient

    from .EOPC import Rosen
    # Optimizing Mode Connectivity for Class Incremental Learning

    from .Autoenv import IEKF
    # iterative extended kalman filter optimizer

    from .fastr import FastrD, FastrN, STORMplus, StormPlus
    # Fully Adaptive STochastic Recursive-momentum

    from .NeuralNetwork import SLBI, SLBI2, SLBI_ADAM_ToolBox, SLBI_SGD_ToolBox
    from .DessiLBI import SLBI, SLBI_ToolBox
    # Exploring Structural Sparsity of Deep Networks via Inverse Scale Spaces

    from .dowg import DoWG, CDoWG
    # DoWG Unleashed: An Efficient Universal Parameter-Free Gradient Descent Method

    from .archai import CocobBackprop, CocobOns, Lamb
    # Microsoft's NAS lib

    from .coin_betting import SGDOL, Cocob, Recursive, Regralizer, Scinol2, ONSBet
    # Parameter-free coin betting optimizers

    from .dolphinflow import DolphinFlow
    # recent Muon/AdamW-like optimizer with a bunch of settings to tune: https://github.com/cognitivecomputations/dolphinflow-optimizer

    from .neosr import adamw_win, adan_sf, adamw_sf, adan, soap_sf, fsam
    # from a super-resolution lib, plus stuff adapted from heavyball

    from .recpre import SOAP, LionW, SophiaG, Lilith, ELLISAdam, IVON, ZeroShampooWithAdamGraftingOptimizer, OrthogonalNesterov
    # recurrent pretraining

    from .supertrainer2k import Adalite, Lilith
    # idk

    from .wu_nature_comms_2024 import NewStyleBatchFISTAOptim, NewStyleSingleFISTAOptim
    # something insane

    from .dd4ml import APTS,APTS_D, TrustRegion, TrustRegionLegacy
    # Additively preconditioned trust-region strategies for machine learning. Requires some type of config and some type of subdomain_optimizer

    from .koaning_io_more_descent_less_gradient import KeepStepping, KeepVaulting
    # keeps stepping on a single batch, or maybe it was supposed to be a line search, idk

    from .CR import COMP
    # Compact representations for recursive Hessian matrix estimates (similar to LBFGS)

    from .MaxFactor import MaxFactor
    # ultra recent

    from .scion import Scion
    # Training Deep Learning Models with Norm-Constrained LMOs.

    from .rapp import RAPPsgd, RAPPadam, ExtraAdagrad, ExtraAdam, ExtraSGD, EGplusAdam, EGplusSGD, LA, AdamLA, ExtraSGDLA, ExtraAdamLA, EGplusLA, EGplusAdamLA
    # Stable Nonconvex-Nonconcave Training via Linear Interpolation

    from .storm_plus import STORMplus
    # STORM+

    from .AccSGD import AccSGD
    # On the insufficiency of existing momentum schemes for Stochastic Optimization

    from .AdaInject import AdaBelief, AdaBeliefInject, AdamInject, diffGrad, diffGradInject,Radam, RadamInject
    # AdaInject: Injection Based Adaptive Gradient Descent Optimizers for Convolutional Neural Networks

    from .PowerSign_and_AddSign import AddSign, PowerSign
    # https://github.com/Neoanarika/Implementing-the-PowerSign-and-AddSign-rule
    from .AddSign_PowerSign_in_PyTorch import AddSign, PowerSign, LinearInternalDecay, CosineInternalDecay, RestartCosineInternalDecay
    # https://github.com/cydonia999/AddSign_PowerSign_in_PyTorch
    # Neural Optimizer Search with Reinforcement Learning

    from .neumann_optimizer import Neumann, Neumann2
    # https://github.com/jayybhatt/neumann-optimizer
    # A Practical Optimization Algorithm for Deep Neural Networks (implicitly computes the inverse Hessian of each mini-batch to produce descent directions)

    from .neural_search_optimizer import Optimizer_1
    # https://github.com/daviddao/pytorch-neural-search-optimizer
    # Neural Optimizer Search's Optimizer_1

  2. inikishev revised this gist Feb 10, 2025. 1 changed file with 92 additions and 8 deletions.
    @@ -2,11 +2,14 @@
    # ruff: noqa: F811

    # ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
    # from pytorch_optimizer import ...
    # from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    # from heavyball import ...
    # from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    # from timm.optim import ...
    from pytorch_optimizer import ADOPT, AdaBelief #, ...
    from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    from heavyball import ForeachSFAdamW, PaLMForeachSFAdamW, ForeachADOPT, ForeachMuon, ForeachLaProp, MuonLaProp, ForeachSOAP, PaLMForeachSOAP, PrecondScheduleForeachSOAP, PrecondSchedulePaLMForeachSOAP, ForeachPSGDKron, ForeachPurePSGD, ForeachCachedDelayedPSGDKron, ForeachCachedPSGDKron, ForeachDelayedPSGD
    from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    from timm.optim import AdaBelief, Adafactor #, ...
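    # hedged usage sketch for schedulefree (pattern from its README; exact kwargs may differ): wrap a base
    # optimizer and toggle train()/eval() around training vs. validation/checkpointing:
    #   opt = ScheduleFreeWrapper(torch.optim.SGD(model.parameters(), lr=1e-3), momentum=0.9)
    #   opt.train()                                  # required before training steps
    #   loss.backward(); opt.step(); opt.zero_grad()
    #   opt.eval()                                   # required before eval / saving a checkpoint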

    # ----------------------------------- MINE ----------------------------------- #
    from torchzero.optim import Adagrad, AdamW #, ...

    # ----------------------------------- misc ----------------------------------- #
    from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
    @@ -18,6 +21,8 @@
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP
    from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
    from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars
    from .cringe_live import AdaAbs, AdaptiveCompass, Clybius, Compass, DOPE, ExMachina, FARMSCropV3, FCompass, SAVEUS
    from .Personalized_Optimizers import FARMSCrop, FARMSCropV2, FCompass, FishMonger, FMARSCrop, FMARSCrop_ExMachina, FMARSCropV2

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    @@ -67,6 +72,7 @@

    from .torch_kfac import KFACOptimizer, EKFACOptimizer
    from .KFAC import KFAC, EKFAC, GKFAC
    from .torch_kfac2 import KFAC # MAYBE GOOD
    # Kronecker-Factored Approximate Curvature

    from .SGDPH.sgdph import sgdph
    @@ -424,7 +430,7 @@
    # AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule

    from .alopex import Alopex
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version (gradient free)

    from .statopt import QHM, SALSA, SSLS, SASA, SLOPE
    # ???idk
    @@ -442,10 +448,88 @@
    # Uncertainty Quantification with the Empirical Tangent Kernel

    from .SimuRLacra import GSS
    # Golden Section Search (I think this is gradient free)
    # Golden Section Search (I think this is gradient free and for univariate funcs)

    from .gcopt import GCOptimizer
    # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)

    from .k_fac import KFACOptimizer,KFACIDOptimizer,SKFACOptimizer,EKFACOptimizer,KBFGSOptimizer,KBFGSLOptimizer,KBFGSL2LOOPOptimizer,KBFGSLMEOptimizer,NGDOptimizer
    # biggest k-fac repo (i fixed all acc_stats)

    from .proxyprox import ProxyProx
    # konstmish's mysterious ProxyProx (has step as well as inner_step methods)

    from .SWANOptimizer import SWAN
    # SWAN (SGD with Whitening And Normalization)

    from .sparse_szo import DuelingEvolutionOptimizer, VanillaEvolutionOptimizer, OneSideEvolutionOptimizer, TwoSideEvolutionOptimizer, FirstOrderOptimizer, FirstOrderBanditOptimizer
    # Sparse Perturbations for Improved Convergence in Stochastic Zeroth-Order Optimization

    from .PSGD_Nuon import Nuon, AutoNuon
    # uses single-sided whitening that is dynamic and learned, instead of instantaneous like Muon's

    from .coherent_gradients import RA3,RM3, M3
    # Weak and Strong Gradient Directions: Explaining Memorization, Generalization, and Hardness of Examples at Scale

    from .eva import Eva, EvaExperimental, KFAC, AdaKFAC, AdaKFAC2, KFACSAM, MFAC, Shampoo
    # Eva: Practical Second-order Optimization with Kronecker-vectorized Approximation (pretty sure they modify the gradient and don't update params)

    from .natural_galore import SubSpaceAdamW
    # GaLore extension - Natural Gradient Descent in low rank subspace

    from .galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit
    # Memory-Efficient LLM Training by Gradient Low-Rank Projection

    from .compass_optimizer import CompassExperimental4Bit, CompassExperimental8Bit, CompassExperimental8BitBNB, Compasstic, LPFAdamW, AdamW, RMSProp
    # A modification of the original AdamW optimizer, replacing the momentum moment with a smoothing filter.

    from .sgd_sai import SGD_sai
    # No More Adam: Learning Rate Scaling at Initialization is All You Need

    from .unrl import EligibilityTraceOptimizer,KFACOptimizer
    # optimizers from Reinforcement Learning algorithms library

    from .second_order_optimization_NQS import SecondOrderOpt
    # Second-order Optimisation strategies for neural network quantum states

    from .ldadamw_torch import LDAdamW
    # Low-Dimensional Adam - Adaptive Optimization from Low-Dimensional Gradient Statistics.

    from .pydrsom import DRSOMB, DRSOMB2, DRSOMK, DRSOMVec, KDRSOM
    # dimension-reduced second-order method (DRSOM)

    from .AdaGL import AdaGL, FractionalSmoothLoss
    # deep learning optimizer that combines fractional-order calculus with adaptive techniques. Using Grünwald–Letnikov derivatives

    from .mkor import MKOR
    # Momentum-Enabled Kronecker-Factor-Based Optimizer Using Rank-1 Updates

    from .sn_sm import GenericOptim, AdamWSN, AdamWSNG
    # GenericOptim is maybe the main one; Subset-Norm and Subspace-Momentum: Faster Memory-Efficient Adaptive Optimization with Convergence Guarantees

    from .OptML_Project import Adasub, Adahessian
    # Comparison of second-order optimizers on transformers

    from .MARS import MARS, ADOPT, Muon, AdamW
    # MARS (Make vAriance Reduction Shine)

    from .pytorch_velo import VeLO
    # learned LSTM optimizer (just a PyTorch wrapper for the JAX optimizer)

    from .mctorch import ConjugateGradient
    # the other optimizers in this package are for manifolds only; this one works on any layers

    from .modded_nanogpt import Kron
    # kron fork by evanatyourservice with recent changes

    from .smplifyx import TrustRegionNewtonCG, LBFGS
    # trust region newton cg

    from .widedeepnetworks import ESS, HMC
    # Gaussian Process Behaviour in Wide Deep Neural Networks (zeroth order?)

    from .alf import NeroPlus, AdamTF
    # keeps the norm of each parameter vector fixed and mean at zero during the optimization process

    from .SOAP_MUON import SOAP_Muon
    # SOAP + Muon = SOAP_Muon
  3. inikishev revised this gist Dec 20, 2024. 1 changed file with 163 additions and 2 deletions.
    @@ -16,6 +16,8 @@
    from .moai import * # insane number of them
    from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP
    from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
    from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    @@ -223,7 +225,7 @@
    from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper
    # scipy minimize (ha ha mine is better)

    from .geoopt import RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    from .geoopt import SGRHMC, RHMC, RSGLD, RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)
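    # hedged sketch of typical geoopt usage (an assumption, check the geoopt docs): the Riemannian optimizers
    # act on geoopt.ManifoldParameter and treat ordinary tensors as Euclidean:
    #   import torch, geoopt
    #   emb = geoopt.ManifoldParameter(torch.randn(100, 2) * 1e-3, manifold=geoopt.PoincareBall())
    #   opt = RiemannianAdam([emb], lr=1e-2)
    #   emb.norm().backward(); opt.step(); opt.zero_grad()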

    from .pykg2vec import RiemannianOptimizer
    @@ -287,4 +289,163 @@
    # noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)

    from .Exponentiated_Gradient import EGPM
    # exponentiated gradient (EG) algorithm and plus-minus variant

    from .zeroptim import MeZO, SmartES
    # zero-order optimization techniques
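    # illustrative sketch of the MeZO-style two-point gradient estimate that zeroth-order methods like these
    # build on (params/closure/lr/eps are placeholders, not this repo's API):
    #   z = [torch.randn_like(p) for p in params]
    #   for p, zi in zip(params, z): p.data.add_(eps * zi)
    #   loss_plus = closure()
    #   for p, zi in zip(params, z): p.data.sub_(2 * eps * zi)
    #   loss_minus = closure()
    #   for p, zi in zip(params, z): p.data.add_(eps * zi)              # restore original params
    #   g = (loss_plus - loss_minus) / (2 * eps)                        # projected gradient estimate
    #   for p, zi in zip(params, z): p.data.sub_(lr * g * zi)           # SGD step along the probe direction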

    from .GDPolyak import GDPolyak
    # Gradient descent with adaptive stepsize converges (nearly) linearly under fourth-order growth

    from .APROX import Truncated, TruncatedAdagrad
    # APROX: Robust Stochastic Optimization Algorithms

    from .SVRG_Pytorch import SVRG
    # efficient variant of SVRG that relies on mini-batching implemented in Pytorch

    from .poincare_embeddings import RiemannianSGD
    # actually working riemannian SGD

    from .tram_optimizer import TRAM
    # Trust Region Aware Minimization

    from .gsam import GSAM
    # Surrogate Gap Guided Sharpness-Aware Minimization

    from .ReinventingWheel import FTRLP
    # FTRL-proximal algorithm (Follow-the-Regularized-Leader and Mirror Descent: Equivalence Theorems and L1 Regularization, H. B. Mcmahan. AISTATS 2011.)

    from .OLoptim import FTML, FTRL_Proximal, OSD, SGDOL_global, SGD_globLR, STORM
    # Online & Stochastic optimization algorithms for deep learning

    from .metaopt import SGD_Multi_LR, SGD_Quotient_LR
    # Online hyperparameter optimization by real-time recurrent learning

    from .core_optimizer import CoRe
    # Continual Resilient (CoRe) Optimizer

    from .Seminar import Ada_Grad, FTRL, nAda_Grad, nFTRL, nKT, nOGD, OGD
    # "Implementation of different algorithms and their normalized counterparts in the pytorch framework"

    from .Recommendation_System_Method_Reproduce import FTRL
    from .Code import FTRL, OBC
    from .ftrl import FTRL
    from .DP_FTRL import FTRLOptimizer # official implementation by Google
    # Follow-the-Regularized-Leader

    from .smart_grid import AdaX
    # AdaX: Adaptive Gradient Descent with Exponential Long Term Memory

    from .nerf_atlas import UniformAdam
    # something crazy with solves and a Laplacian matrix??

    from .mlopt import Adahessian, Goldstein, Normalized_Optimizer, OneBit_Adam, SAM, Alternate_SAM, Alternate_SAM_v2, Alternate_SAM_v3, AdamS_v1, ASAM_ON, Sketch_Adam, SophiaG, Sophus, GN_DOM_SGD, GN_BULK_SGD, DOM_SGD, BULK_SGD
    # crazy stuff (no descriptions)

    from .subgd import PCAOptimizer
    # Few-Shot Learning by Dimensionality Reduction in Gradient Space (needs some kind of config)

    from .RFR_NeurIPS23 import RFR
    # robust fairness regularization (RFR) - Chasing Fairness under Distribution Shift: a Model Weight Perturbation Approach

    from .A_Deep_Learning_Optimizer_Based_on_Grunwald_Letnikov_Fractional_Order_Definition import FCSGD_G_L, FCAdam_G_L
    # A Deep Learning Optimizer Based on Grunwald Letnikov Fractional Order Definition

    from .VFOGD_PF_and_Its_Application_in_Deep_Learning import VFOSGD_PF, VFOAdam_PF
    # VFOGD_PF and Its Application in Deep Learning

    from .staleness_corrected_momentum import SCMSGD, SCMTDProp, OptChain, FixedSGD
    # Correcting Momentum in Temporal Difference Learning

    from .DPSGD import DPSGD
    # PyTorch implementation of tf.privacy.DPGradientDescentGaussianOptimizer

    from .DPtorch import JDPSGD
    # Improving Deep Learning with Differential Privacy using Gradient Encoding and Denoising

    from .optimizer2 import AdaBound, AdaGC, AdaMod, Adan, Yogi
    # Adaptive Optimization Algorithm with Gradient Bias Correction (AdaGC)

    from .ProxSPS import SPS, ProxAdam
    # Polyak step sizes with weight decay in Pytorch

    from .bb_dl import BB
    # Barzilai-Borwein-based Adaptive Learning Rate for Deep Learning

    from .Adaptive_learning_rate_optimization_algorithm_with_dynamic_bound_based_on_Barzilai_Borwein_method import BBbound, AdaBBbound
    # Adaptive learning rate optimization algorithm with dynamic bound based on Barzilai-Borwein method

    from .mL_BFGS import SlimQN, BlockSlimQN, KFACOptimizer, LBFGSOptimizer, SGDOptimizer
    # A Momentum-based L-BFGS for Distributed Large-Scale Neural Network Optimization

    from .Noisy_SGD import GGDO1, GGDO2, GGDO3, GGDO4, GGDO5, pSGLD,SGLD
    # Adaptively Preconditioned Stochastic Gradient Langevin Dynamics

    from .adamaio import AdamAIO
    # All-In-One Adam

    from .adams import Adams, AdamUCB, AdamCB
    # Exploiting Uncertainty of Loss Landscape for Stochastic Optimization

    from .AdaTS import AdaTS, AdaITS, AdamTS, YOGITS
    # ???? can't find anything about "AdaTS algorithm"

    from .MNSAM import MNSAM, SAM, SUM, Adan
    # Sharpness-Aware Minimization Method with Momentum Acceleration for Deep Neural Networks

    from .innaprop import INNAprop
    # INNAprop, a second-order optimization algorithm for deep learning

    from .M3Learning import AdaHessian, TRCG
    # Trust-Region Conjugate Gradient

    from .opt_for_pinns import Adam_LBFGS, Adam_LBFGS_GD, Adam_LBFGS_NNCG, ALRMAG, GD, NysNewtonCG, PolyakGD, PolyakLBFGS, SketchyGN, SketchySGD
    # bunch of stuff from "Challenges in Training PINNs: A Loss Landscape Perspective"

    from .deepxde import NNCG
    # NysNewtonCG, a damped Newton-CG method that uses Nyström preconditioning

    from .alternating_layers import DampedNewton
    # seemingly a good DampedNewton (they apply it to the final layer; the rest is optimized by a 1st-order method)

    from .nanoGPTSLS import AdamSLS, KenSLS
    # training nanoGPT with SLS

    from .Skoltech3D import BatchBFGS, BatchGD
    # batch BFGS? what's that (these don't inherit Optimizer but have the same signature)

    from .ICNN_verification import SdLBFGS, SdLBFGS0
    # Stochastic quasi-Newton methods for nonconvex stochastic optimization

    from .ZO_AdaMM_vs_FO_AdaMM import AdaMM
    # ON THE CONVERGENCE OF ADAM AND BEYOND

    from .AdaSTE import BayesBiNN, FenBPOpt, FenBPOptQuad, FenBPOptProx, MDTanhOpt
    # AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule

    from .alopex import Alopex
    # ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version

    from .statopt import QHM, SALSA, SSLS, SASA, SLOPE
    # ???idk

    from .superpolyak import SuperPolyak, NewtonCG
    # SuperPolyak subgradient method - first-order method for solving (possibly) nonsmooth equations/optimization problems

    from .GaussNewtonPolyak import GNP
    # A linearly convergent Gauss-Newton subgradient method for ill-conditioned problems

    from .ntd import NTD, Polyak
    # Normal-Tangent-Descent (A nearly linearly convergent first-order method for nonsmooth functions with quadratic growth)

    from .nuqls import LaplaceGGN
    # Uncertainty Quantification with the Empirical Tangent Kernel

    from .SimuRLacra import GSS
    # Golden Section Search (I think this is gradient free)

    from .gcopt import GCOptimizer
    # Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)

    from .k_fac import KFACOptimizer,KFACIDOptimizer,SKFACOptimizer,EKFACOptimizer,KBFGSOptimizer,KBFGSLOptimizer,KBFGSL2LOOPOptimizer,KBFGSLMEOptimizer,NGDOptimizer
    # biggest k-fac repo (i fixed all acc_stats)
  4. inikishev created this gist Dec 6, 2024.
    @@ -0,0 +1,290 @@
    # pylint: disable = reimported
    # ruff: noqa: F811

    # ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
    # from pytorch_optimizer import ...
    # from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
    # from heavyball import ...
    # from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
    # from timm.optim import ...
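    # most of the optimizers below are drop-in torch.optim.Optimizer subclasses, so the usual loop applies
    # (a generic sketch, SomeOptimizer is a placeholder; line-search / second-order ones typically need step(closure) instead):
    #   opt = SomeOptimizer(model.parameters(), lr=1e-3)
    #   for x, y in loader:
    #       opt.zero_grad()
    #       loss = criterion(model(x), y)
    #       loss.backward()
    #       opt.step()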

    # ----------------------------------- misc ----------------------------------- #
    from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
    from .PersonalCodeRepository import SVRG, ErrorFeedbackSGD
    from .sota_data_augmentation_and_optimizers import RAdam, DeepMemory, Lookahead
    from .Awesome_Optimizers import * # insane number of them
    from .moai import * # insane number of them
    from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
    from .pyutils import Adam_GC,DAdaptAdam,DAdaptSGD,GLD,Lookahead,Prodigy,RAdam,SAM,SGD_GC,SMTP

    # ----------------------------------- repos ---------------------------------- #
    from .kron_torch import Kron
    # Kronecker-factored preconditioner

    from .MEKF_MAME import MEKF, MEKF_MA, Lookahead
    # Modified Extended Kalman Filter with generalized exponential Moving Average

    from .NGD_SGD import NGD
    # Natural gradient descent

    from .psgd_torch import LRA, Affine, Kron, Newton, XMat
    # Preconditioned stochastic gradient descent (PSGD)

    from .psiDAG import UniversalSGD
    # Universal Stochastic Gradient Method

    from .RiemannianSGD import HyperboloidRSGD, PoincareRSGD
    # Non-Euclidean space GD

    from .StochasticMirrorDescent import SMD_compress, SMD_qnorm
    # Stochastic Mirror Descent

    from .SUG.SUG import SUG
    # Adaptive stochastic gradient method based on the universal gradient method

    from .VTTCG import VTTCG, AdaBelief
    # Variable three-term conjugate gradient method

    from .FAdam import FAdam, AnyPrecisionAdamW
    # Fisher Adam

    from .dfw import DFW
    # Deep Frank Wolfe

    from .coolmomentum import Coolmom, Coolmomentum, Coollin
    # CoolMomentum: a method for stochastic optimization by Langevin dynamics with simulated annealing

    from .bgd import BGD
    # Bayesian Gradient Descent

    from .torchimize import GNA
    # Gauss-Newton algorithm

    from .autosgm import AutoSGM
    # AutoSGM: A Unifying Framework for Accelerated Learning

    from .torch_kfac import KFACOptimizer, EKFACOptimizer
    from .KFAC import KFAC, EKFAC, GKFAC
    # Kronecker-Factored Approximate Curvature

    from .SGDPH.sgdph import sgdph
    # SGD with Partial Hessian

    from .LaplacianSmoothing_GradientDescent import LS_SGD # doesn't work
    from .LS_MCMC import LSpSGLD, LSSGLD, pSGLD, SGLD # LSpSGLD, LSSGLD require "vecs", whatever that is
    from .DP_LSSGD import LSSGD, LSSGDTorch # don't work
    from .dlt import LSSGD, LSSGDTorch # don't work
    # Laplacian Smoothing Gradient Descent

    from .adashift import AdaShift
    # AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods

    from .soap import SOAP
    # Shampoo with Adam in the Preconditioner's eigenbasis (SOAP).

    from .PAL import PalOptimizer
    # PAL - Parabolic Approximation Line Search for DNNs

    from .LABPAL import GOLSI, LabPal, PalOptimizer, PLS, Sls, SLS
    # The Large-Batch Parabolic Approximation Line Search (LABPAL)

    from .lion import Lion, LionForEach # LionForEach is not in __init__ by default so idk if it is tested
    # EvoLved Sign Momentum (Symbolic Discovery of Optimization Algorithms)
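    # illustrative update (per the paper, not this file's code):
    #   update = sign(beta1 * m + (1 - beta1) * g);  p -= lr * (update + wd * p);  m = beta2 * m + (1 - beta2) * g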

    from .adam_atan2 import AdamAtan2, AdamAtan2ForEach, AdamAtan2WithWassersteinReg # Only AdamAtan2 in __init__
    # Adam with atan2 instead of epsilon (Scaling Exponents Across Parameterizations and Optimizers)

    from .grokfast import GrokFastAdamW
    # Grokfast, Accelerated Grokking by Amplifying Slow Gradients
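    # illustrative sketch of the Grokfast idea (an EMA filter on the gradient before the wrapped AdamW step;
    # variable names are mine): ema = alpha * ema + (1 - alpha) * grad;  grad = grad + lamb * ema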

    from .lbfgs import LBFGSNew, LBFGSB
    # Improved LBFGS and LBFGS-B optimizers.
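    # these follow the closure-based interface of torch.optim.LBFGS (an assumption for LBFGSNew/LBFGSB; kwargs
    # below may differ), a minimal sketch of the pattern:
    #   opt = LBFGSNew(model.parameters(), history_size=7, max_iter=2)
    #   def closure():
    #       opt.zero_grad()
    #       loss = criterion(model(x), y)
    #       loss.backward()
    #       return loss
    #   opt.step(closure)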

    from .AdEMAMix import AdEMAMix, AdEMAMixDistributedShampoo
    # The AdEMAMix Optimizer: Better, Faster, Older (mixture of two EMAs)

    from .parameterfree import COCOB, KT, cKT
    # Parameter-Free Optimizers

    from .SimulatedAnnealing import SimulatedAnnealing
    # Simulated Annealing

    from .Positive_Negative_Momentum import PNM, AdaPNM
    # Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization

    from .AngularGrad import AdaBelief, diffgrad, cosangulargrad, tanangulargrad
    # AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks

    from .PIDOptimizer import PIDOptimizer
    # A PID Controller Approach for Stochastic Optimization of Deep Networks

    from .esgd import ESGD
    # Stochastic non-convex second order optimizer

    from .pytorch_soo import * # a lot of them
    # Second Order Optimizers for Machine Learning

    from .curveball import CurveBall, CurveBallInterleave
    # Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning

    from .torch_second_order import GradientDescent, LevenbergMarquardt
    # Levenberg–Marquardt algorithm

    from .grnewt import NewtonSummary, NewtonSummaryVanilla, NewtonSummaryFB, NewtonSummaryUniformAvg
    # Adapting Newton's Method to Neural Networks through a Summary of Higher-Order Derivatives

    from .pytorch_storm import STORM
    # stochastic first order trust region method

    from .pytorch_trish import TRish
    # A Stochastic Trust Region Algorithm Based on Careful Step Normalization

    from .fate_llm import ZerothOrderOptimizer, KSeedZerothOrderOptimizer
    # "This optimizer performs a `random` walk update for the parameters of the model."

    from .FederatedScope_FedKSeed import MeZOBiasOptimizer
    from .fusion_bench import MeZO
    # MEZO

    from .NewtonCG import NewtonCG
    # Newton-CG algorithm with backtracking line-search

    from .dreamplace import NesterovAcceleratedGradientOptimizer
    # Nesterov's implementation of the e-place algorithm (???) (THIS IS NOT NESTEROV MOMENTUM, IT'S NESTEROV SOMETHING ELSE)

    from .sls_ffa import Sls, SlsAcc, SlsEg, SVRG, AdaBound, CocobBackprop, CocobOns, PLS
    # Stochastic line search (fork with more stuff)

    from .sps import Sps
    # Stochastic Polyak Step-size

    from .ada_sls import AdaSLS
    # Adaptive Gradient Methods Converge Faster with Over-Parameterization

    from .sls import Sls, SlsAcc, SlsEg
    # Stochastic line search

    from .chop import PGD, PGDMadry, S3CM, PairwiseFrankWolfe, FrankWolfe
    # constrained optimization for PyTorch

    from .ncg_optimizer import LCG, BASIC
    from .ncg_optimizer_ApostolosGreece import LCG, BASIC # fork, seems to have some kind of changes
    # nonlinear conjugate gradient

    from .LPF_SGD import EntropySGD2, EntropyAdam, EntropySGD, SAM
    # Low-Pass Filtering SGD for Recovering Flat Optima (but I don't think it has LPFSGD optimizer, unless EntropySGD is one)

    from .optimizer import SAM, NelderMead, PatternSearch
    # bro made a nelder mead (Loss Landscapes are All You Need: Neural Network Generalization Can Be Explained Without the Implicit Bias of Gradient Descent)

    from .convis import FiniteDifferenceGradientOptimizer
    # apparently second-order finite differences

    from .fullbatch import AdaptiveGradientClipping, FISTA, FISTALineSearch, SGDLineSearch, LARS, LBFGS, SAM, SGD_AGC, RestartingLineSearch, NonMonotoneLinesearch, WolfeGradientDescent
    # Training vision models with full-batch gradient descent and regularization

    from .peps_torch_feat_czx import SGD_MOD, LBFGS_MOD
    # SGD with backtracking line search

    from .Target_Based_Surrogates_For_Stochastic_Optimization import Ada_FMDOpt, Adam_FMDOpt, Diag_Ada_FMDOpt, GULF2, LSOpt, MD_FMDOpt, Online_Newton_FMDOpt, Sadagrad, SGD_FMDOpt, SLS_FMDOpt, SVRG
    # Target Based Surrogates For Stochastic Optimization (some crazy stuff)

    from .SDLS import SDLS
    # Armijo Back-tracking line search on Training DNN

    from .hessianfree import HessianFree
    # Deep learning via Hessian-free optimization (need to install backpack)

    from .salsa.SaLSA import SaLSA
    # SALSA - Stable Armijo Line Search Adaptation

    from .nitorch import OGM, BacktrackingLineSearch
    # optimizers from neuroimaging library

    from .qori_aziz_sa import SimulatedAnnealing
    # SA from someone's homework

    from .neural_net_optimizers import GeneticAlgorithm, ParticleSwarm
    # DFO (derivative-free optimization)

    from .NNAIF import CMAES, EMNA, IMFIL, NNAIF, SGPGD, RESNETEULER
    # Neural Network Accelerated Implicit Filtering: Integrating Neural Network Surrogates With Provably Convergent Derivative Free Optimization Methods

    from .befree import CurveBall, HessianFree, Newton, SimplifiedHessian
    # On the New method of Hessian-free second-order optimization

    from .bayesian_snn import BayesBiSNN, GaussianBayesOptimizer
    # Bayesian Continual Learning via Spiking Neural Networks (I think it needs layers from that lib too)

    from .ML_APTS import APTS, LocalTR, TR, TRAdam
    # Additively preconditioned trust-region strategies for machine learning

    from .torchmin import Minimizer, ScipyMinimizer
    from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper
    # scipy minimize (ha ha mine is better)

    from .geoopt import RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
    # Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers idk)

    from .pykg2vec import RiemannianOptimizer
    # from "Python library for knowledge graph embedding" (but I changed it to affect all layers)

    from .M_FAC import MFAC
    # M-FAC: Efficient Matrix-Free Approximations of Second-Order Information

    from .ddpnopt import Step, RmsDDP, AdamDDP
    # DDPNOpt: Differential Dynamic Programming Neural Optimizer

    from .singd import SINGD
    # KFAC-like Structured Inverse-Free Natural Gradient Descent

    from .sirfshampoo import SIRFShampoo
    # SIRFShampoo: Structured inverse- and root-free Shampoo in PyTorch

    from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer
    # Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)

    from .Muon import Muon, AutoMuon
    # MomentUm Orthogonalized by Newton-schulz.

    from .orth_optim import orthogonalise
    # Orthogonalising gradients to speed up neural network optimisation. `orthogonalise(AdamW)(model.parameters(), lr = 1e-3)`

    from .torch_pso import ParticleSwarmOptimizer, GenerationalPSO,AutotuningPSO,RingTopologyPSO,ChaoticPSO,GenericPSO,AcceleratedPSO,SineCosineAlgorithm,ImprovedSineCosineAlgorithm
    # Particle Swarm Optimization
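    # gradient-free, so step() takes a closure that just returns the loss (a hedged sketch; constructor kwargs
    # such as num_particles are assumptions, check the torch-pso README):
    #   opt = ParticleSwarmOptimizer(model.parameters(), num_particles=50)
    #   opt.step(lambda: criterion(model(x), y))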

    from .langevin_sampling import SGLD, pSGLD
    # Sampling with gradient-based Markov Chain Monte Carlo approaches

    from .adopt import ADOPT
    # Modified Adam Can Converge with Any β2 with the Optimal Rate

    from .fsdp_optimizers import SOAP, Kron, Muon, KronMars
    # optimizers with FSDP support

    from .NGPlus import NGPlus, o_NGPlus, o_NGPlus_Block, create_oNG_optimizer
    # NG+: A new second-order optimizer for deep learning

    from .MARS_AdamW import MarsAdamW
    # MARS: Unleashing the Power of Variance Reduction for Training Large Models

    from .MSAM import AdamW, AdamW_MSAM, AdamW_SAM, ESAM, LookSAM, MSAM, SAM, SGD
    # Momentum-SAM: Sharpness Aware Minimization without Computational Overhead
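    # hedged sketch of the common two-step SAM interface (davda54-style; an assumption that these variants
    # mirror it, check each class - MSAM itself is designed to avoid the second forward/backward pass):
    #   opt = SAM(model.parameters(), torch.optim.SGD, rho=0.05, lr=0.1, momentum=0.9)
    #   criterion(model(x), y).backward()
    #   opt.first_step(zero_grad=True)           # ascend to the perturbed weights
    #   criterion(model(x), y).backward()        # second forward/backward at the perturbed point
    #   opt.second_step(zero_grad=True)          # descend using the perturbed gradient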

    from .adasub import SubHes
    # Stochastic Optimization Using Second-Order Information in Low-Dimensional Subspaces

    from .MomSPS import MomSPS, MomSPS_smooth
    # Stochastic Polyak Step-sizes and Momentum

    from .momo import Momo, MomoAdam
    # Momentum Models for Adaptive Learning Rates

    from .DIMAT import CDMSGD, CDSGD, CGA, DSMA, LDSGD, SGP, SwarmSGD
    # Decentralized Iterative Merging-And-Training for Deep Learning Models

    from .Noise_stability_optimization import BSAM, NSM, SAM, RSAM
    # noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)

    from .Exponentiated_Gradient import EGPM
    # exponentiated gradient (EG) algorithm and plus-minus variant