openllama-3b-qlora.yml
@cfahlgren1 · Last active January 2, 2024, 23:45
Revisions (newest first)

  1. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions openllama-3b-qlora.yml
    @@ -22,7 +22,7 @@ dataset_prepared_path:
     val_set_size: 0.05
     adapter: qlora
     lora_model_dir:
    -sequence_len: 1024
    +sequence_len: 2048
     sample_packing: true
     lora_r: 8
     lora_alpha: 32
    @@ -36,8 +36,8 @@ wandb_watch:
     wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
    -gradient_accumulation_steps: 2
    -micro_batch_size: 80
    +gradient_accumulation_steps: 1
    +micro_batch_size: 10
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
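    This final revision sharply cuts the per-step batch while doubling the context window. A minimal sketch (not code from the gist) of the standard effective-batch-size arithmetic, assuming single-GPU training:

    def effective_batch(micro_batch_size: int, grad_accum_steps: int, world_size: int = 1) -> int:
        # Samples that contribute to a single optimizer step.
        return micro_batch_size * grad_accum_steps * world_size

    # Before this revision: 80 * 2 = 160 samples per step at sequence_len 1024.
    # After:                10 * 1 =  10 samples per step at sequence_len 2048.
    print(effective_batch(80, 2), effective_batch(10, 1))  # 160 10

    Since sample_packing: true packs examples up to sequence_len, tokens per optimizer step drop from roughly 160 x 1024 to 10 x 2048.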
  2. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions openllama-3b-qlora.yml
    @@ -18,8 +18,6 @@ datasets:
         type: alpaca
         data_files:
           - data.json
    -  - path: c9bc9129-eba0-4b10-8292-4ae70fc7fa0d.json
    -    type: alpaca
     dataset_prepared_path:
     val_set_size: 0.05
     adapter: qlora
  3. cfahlgren1 revised this gist Jan 2, 2024. No changes.
  4. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -44,7 +44,7 @@ num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 1e-4
    +learning_rate: 2e-5
     train_on_inputs: false
     group_by_length: false
     bf16: false
  5. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions openllama-3b-qlora.yml
    @@ -39,12 +39,12 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
     gradient_accumulation_steps: 2
    -micro_batch_size: 100
    +micro_batch_size: 80
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 2e-4
    +learning_rate: 1e-4
     train_on_inputs: false
     group_by_length: false
     bf16: false
  6. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -39,7 +39,7 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
     gradient_accumulation_steps: 2
    -micro_batch_size: 64
    +micro_batch_size: 100
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  7. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -39,7 +39,7 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
     gradient_accumulation_steps: 2
    -micro_batch_size: 10
    +micro_batch_size: 64
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  8. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions openllama-3b-qlora.yml
    @@ -38,8 +38,8 @@ wandb_watch:
     wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
    -gradient_accumulation_steps: 4
    -micro_batch_size: 2
    +gradient_accumulation_steps: 2
    +micro_batch_size: 10
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  9. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions openllama-3b-qlora.yml
    @@ -38,8 +38,8 @@ wandb_watch:
     wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
    -gradient_accumulation_steps: 16
    -micro_batch_size: 1
    +gradient_accumulation_steps: 4
    +micro_batch_size: 2
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  10. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 3 deletions.
    5 changes: 2 additions & 3 deletions openllama-3b-qlora.yml
    @@ -38,14 +38,13 @@ wandb_watch:
     wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
    -gradient_accumulation_steps: 1
    -batch_size: 8
    +gradient_accumulation_steps: 16
     micro_batch_size: 1
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 0.00001
    +learning_rate: 2e-4
     train_on_inputs: false
     group_by_length: false
     bf16: false
  11. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion openllama-3b-qlora.yml
    @@ -39,7 +39,8 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out-2
     gradient_accumulation_steps: 1
    -micro_batch_size: 2
    +batch_size: 8
    +micro_batch_size: 1
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
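    This revision switches to an explicit batch_size knob; the chronologically later revision (item 10 above) removes it again in favor of gradient accumulation alone. A minimal sketch of the relationship between the two knobs, assumed from axolotl's convention that the total batch is the per-device micro batch times the accumulation steps (the exact derivation lives in axolotl internals):

    def derived_grad_accum(batch_size: int, micro_batch_size: int) -> int:
        # Assumed axolotl-style derivation: accumulation steps = total batch / micro batch.
        return batch_size // micro_batch_size

    print(derived_grad_accum(batch_size=8, micro_batch_size=1))  # 8 accumulation steps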
  12. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -44,7 +44,7 @@ num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 0.0001
    +learning_rate: 0.00001
     train_on_inputs: false
     group_by_length: false
     bf16: false
  13. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions openllama-3b-qlora.yml
    @@ -37,14 +37,14 @@ wandb_entity:
     wandb_watch:
     wandb_name:
     wandb_log_model:
    -output_dir: ./qlora-out
    +output_dir: ./qlora-out-2
     gradient_accumulation_steps: 1
     micro_batch_size: 2
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 0.0002
    +learning_rate: 0.0001
     train_on_inputs: false
     group_by_length: false
     bf16: false
  14. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions openllama-3b-qlora.yml
    @@ -39,12 +39,12 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out
     gradient_accumulation_steps: 1
    -micro_batch_size: 32
    +micro_batch_size: 2
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 0.0064
    +learning_rate: 0.0002
     train_on_inputs: false
     group_by_length: false
     bf16: false
  15. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -44,7 +44,7 @@ num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
     lr_scheduler: cosine
    -learning_rate: 0.0002
    +learning_rate: 0.0064
     train_on_inputs: false
     group_by_length: false
     bf16: false
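    Revisions 15 up through 4 sweep the peak learning rate (from 0.0064 here down to 2e-5 in the final revision); each value sets the peak of the same schedule implied by lr_scheduler: cosine and warmup_steps: 20. A minimal sketch of that conventional warmup-plus-cosine shape (total_steps is a hypothetical stand-in for the real run length, not a value from the gist):

    import math

    def lr_at(step: int, peak_lr: float, warmup_steps: int = 20, total_steps: int = 1000) -> float:
        # Linear warmup to peak_lr, then cosine decay toward zero.
        if step < warmup_steps:
            return peak_lr * step / warmup_steps
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))

    print(lr_at(10, 2e-4))   # mid-warmup: 1e-4
    print(lr_at(510, 2e-4))  # halfway through decay: ~1e-4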
  16. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion openllama-3b-qlora.yml
    @@ -18,7 +18,8 @@ datasets:
         type: alpaca
         data_files:
           - data.json
    -
    +  - path: c9bc9129-eba0-4b10-8292-4ae70fc7fa0d.json
    +    type: alpaca
     dataset_prepared_path:
     val_set_size: 0.05
     adapter: qlora
  17. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -31,7 +31,7 @@ lora_dropout: 0.05
     lora_target_modules:
     lora_target_linear: true
     lora_fan_in_fan_out:
    -wandb_project:
    +wandb_project: axolotl
     wandb_entity:
     wandb_watch:
     wandb_name:
  18. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -38,7 +38,7 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out
     gradient_accumulation_steps: 1
    -micro_batch_size: 8
    +micro_batch_size: 32
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  19. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion openllama-3b-qlora.yml
    @@ -38,7 +38,7 @@ wandb_name:
     wandb_log_model:
     output_dir: ./qlora-out
     gradient_accumulation_steps: 1
    -micro_batch_size: 2
    +micro_batch_size: 8
     num_epochs: 4
     optimizer: paged_adamw_32bit
     torchdistx_path:
  20. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions openllama-3b-qlora.yml
    @@ -14,6 +14,10 @@ datasets:
       - path: cfahlgren1/Capybara-Converted
         type: sharegpt
         conversation: chatml
    +  - path: cfahlgren1/DevSpecCode
    +    type: alpaca
    +    data_files:
    +      - data.json
     
     dataset_prepared_path:
     val_set_size: 0.05
  21. cfahlgren1 revised this gist Jan 2, 2024. 1 changed file with 7 additions and 2 deletions.
    9 changes: 7 additions & 2 deletions openllama-3b-qlora.yml
    @@ -5,11 +5,16 @@ load_in_8bit: false
     load_in_4bit: true
     strict: false
     push_dataset_to_hub:
    +
     datasets:
    -  - path: cfahlgren1/openhermes-15k
    +  - path: cfahlgren1/openhermes-2k
         type: alpaca
         data_files:
    -      - openhermes-15k.json
    +      - openhermes-2k.json
    +  - path: cfahlgren1/Capybara-Converted
    +    type: sharegpt
    +    conversation: chatml
    +
     dataset_prepared_path:
     val_set_size: 0.05
     adapter: qlora
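    From this revision onward the config mixes two prompt formats. For reference, this is the record shape each type expects, assumed from the common alpaca and sharegpt conventions axolotl supports; the records are illustrative, not rows from the referenced datasets:

    type: alpaca (one instruction/response object per example):
      {"instruction": "Summarize the text.", "input": "", "output": "..."}

    type: sharegpt with conversation: chatml (multi-turn conversations):
      {"conversations": [{"from": "human", "value": "..."},
                         {"from": "gpt", "value": "..."}]}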
  22. cfahlgren1 created this gist Dec 28, 2023.
    63 changes: 63 additions & 0 deletions openllama-3b-qlora.yml
    @@ -0,0 +1,63 @@
    +base_model: openlm-research/open_llama_3b_v2
    +model_type: LlamaForCausalLM
    +tokenizer_type: LlamaTokenizer
    +load_in_8bit: false
    +load_in_4bit: true
    +strict: false
    +push_dataset_to_hub:
    +datasets:
    +  - path: cfahlgren1/openhermes-15k
    +    type: alpaca
    +    data_files:
    +      - openhermes-15k.json
    +dataset_prepared_path:
    +val_set_size: 0.05
    +adapter: qlora
    +lora_model_dir:
    +sequence_len: 1024
    +sample_packing: true
    +lora_r: 8
    +lora_alpha: 32
    +lora_dropout: 0.05
    +lora_target_modules:
    +lora_target_linear: true
    +lora_fan_in_fan_out:
    +wandb_project:
    +wandb_entity:
    +wandb_watch:
    +wandb_name:
    +wandb_log_model:
    +output_dir: ./qlora-out
    +gradient_accumulation_steps: 1
    +micro_batch_size: 2
    +num_epochs: 4
    +optimizer: paged_adamw_32bit
    +torchdistx_path:
    +lr_scheduler: cosine
    +learning_rate: 0.0002
    +train_on_inputs: false
    +group_by_length: false
    +bf16: false
    +fp16: true
    +tf32: false
    +gradient_checkpointing: true
    +early_stopping_patience:
    +resume_from_checkpoint:
    +local_rank:
    +logging_steps: 1
    +xformers_attention:
    +flash_attention: true
    +gptq_groupsize:
    +gptq_model_v1:
    +warmup_steps: 20
    +evals_per_epoch: 4
    +saves_per_epoch: 1
    +debug:
    +deepspeed:
    +weight_decay: 0.1
    +fsdp:
    +fsdp_config:
    +special_tokens:
    +  bos_token: "<s>"
    +  eos_token: "</s>"
    +  unk_token: "<unk>"
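    The load_in_4bit: true plus adapter: qlora pair is what makes this a QLoRA run: the base model loads quantized to 4-bit while only the LoRA adapter weights (lora_r: 8, lora_alpha: 32) are trained. For context, a config like this was typically launched with the axolotl CLI of this period; the invocation below is assumed from axolotl's documented workflow at the time, not from the gist:

    accelerate launch -m axolotl.cli.train openllama-3b-qlora.yml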