Currently, transformers attempts to offload the quantized weights to CPU on every rank. We know from Answer.AI's FSDP+QLoRA work that this is only needed on rank 0; all other ranks can load the parameter weights onto the meta device instead, so they don't waste memory on weights FSDP will broadcast anyway.
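The patched branch below handles this split inside transformers, but a minimal sketch of the pattern looks roughly like the following. This is an illustration, not the branch's actual code; it assumes `torch.distributed` is launched via `torchrun`, and the model id is a placeholder.

```python
# Sketch of the rank-0 / meta-device loading split, assuming torchrun has
# set up the process group environment; model_id is a placeholder.
import torch
import torch.distributed as dist
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

dist.init_process_group("nccl")
rank = dist.get_rank()

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

if rank == 0:
    # Only rank 0 materializes the real quantized weights.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config
    )
else:
    # Every other rank builds the same module tree on the meta device,
    # which allocates no storage for the weights.
    with torch.device("meta"):
        model = AutoModelForCausalLM.from_config(
            AutoConfig.from_pretrained(model_id)
        )

# Wrapping with FSDP(..., sync_module_states=True) then broadcasts rank 0's
# weights to the meta-initialized ranks at wrap time.
```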
```bash
pip uninstall transformers
pip install "transformers @ git+https://github.com/winglian/transformers.git@fsdp-meta-sharding"
```

Also, when loading pre-quantized weights, bitsandbytes doesn't set the `quant_state` that FSDP needs. @matthewdouglas will have a PR up for this fix soon. In the meantime, you can use the branch below.
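If you want to check whether you're hitting this, here is a quick diagnostic, assuming a `model` already loaded from pre-quantized weights with bitsandbytes (the variable name is ours):

```python
# Find 4-bit parameters whose quant_state was never populated; FSDP needs
# the quant_state on every Params4bit to dequantize after sharding.
from bitsandbytes.nn import Params4bit

missing = [
    name
    for name, param in model.named_parameters()
    if isinstance(param, Params4bit) and param.quant_state is None
]
if missing:
    print(f"{len(missing)} 4-bit params missing quant_state, e.g. {missing[0]}")
```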