Initializing the loader with command-line arguments
loader.py
def __init__(self, params: dict = None):
    """
    Model initialization
    :param params: dictionary of loader parameters (typically parsed from the command line)
    """
    self.model = None
    self.tokenizer = None
    self.params = params or {}
    self.model_name = self.params.get('model_name', False)
    self.model_path = self.params.get('model_path', None)
    self.no_remote_model = self.params.get('no_remote_model', False)
    self.lora = self.params.get('lora', '')
    self.use_ptuning_v2 = self.params.get('use_ptuning_v2', False)
    self.lora_dir = self.params.get('lora_dir', '')
    self.ptuning_dir = self.params.get('ptuning_dir', 'ptuning-v2')
    self.load_in_8bit = self.params.get('load_in_8bit', False)
    self.bf16 = self.params.get('bf16', False)
    self.is_chatgmlcpp = "chatglm2-cpp" == self.model_name
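For reference, a minimal sketch of how this constructor might be fed from parsed command-line arguments; the flag names and the LoaderCheckPoint import path are assumptions that mirror the keys read above:

# Hypothetical sketch: build the params dict from argparse and construct the loader.
# The flag names mirror the keys read in __init__ above; the import path is an assumption.
import argparse
from models.loader import LoaderCheckPoint  # assumed import path

parser = argparse.ArgumentParser()
parser.add_argument('--model-name', default='chatglm2-6b-32k')
parser.add_argument('--model-path', default=None)
parser.add_argument('--no-remote-model', action='store_true')
parser.add_argument('--lora', default='')
parser.add_argument('--use-ptuning-v2', action='store_true')
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--bf16', action='store_true')
args = parser.parse_args()

# vars(args) yields keys with underscores (model_name, model_path, ...), which is exactly
# what __init__ reads; keys not covered by the parser fall back to the .get() defaults.
loaderCheckPoint = LoaderCheckPoint(vars(args))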
Model instantiation
shared.py
def loaderLLM(llm_model: str = None, no_remote_model: bool = False, use_ptuning_v2: bool = False) -> Any:
    """
    Initialize the llm_model_ins LLM instance
    :param llm_model: model_name
    :param no_remote_model: do not pull the checkpoint from the remote hub; add `--no-remote-model` when loading a local model
    :param use_ptuning_v2: use the p-tuning-v2 PrefixEncoder
    :return:
    """
    # defaults to chatglm2-6b-32k
    pre_model_name = loaderCheckPoint.model_name
    # the parameters for chatglm2-6b-32k defined in model_config
    llm_model_info = llm_model_dict[pre_model_name]
    if no_remote_model:
        loaderCheckPoint.no_remote_model = no_remote_model
    if use_ptuning_v2:
        loaderCheckPoint.use_ptuning_v2 = use_ptuning_v2
    # if a model was passed in explicitly, use its configuration (defaults to None)
    if llm_model:
        llm_model_info = llm_model_dict[llm_model]
    loaderCheckPoint.model_name = llm_model_info['name']
    # defaults to THUDM/chatglm2-6b-32k
    loaderCheckPoint.pretrained_model_name = llm_model_info['pretrained_model_name']
    # the local path must be specified manually
    loaderCheckPoint.model_path = llm_model_info["local_model_path"]
    # e.g. ChatGLMLLMChain
    if 'FastChatOpenAILLM' in llm_model_info["provides"]:
        loaderCheckPoint.unload_model()
    else:
        loaderCheckPoint.reload_model()
    # resolve the class by name, e.g. <class 'models.chatglm_llm.ChatGLMLLMChain'>
    provides_class = getattr(sys.modules['models'], llm_model_info['provides'])
    # instantiate the class into a model object
    modelInsLLM = provides_class(checkPoint=loaderCheckPoint)
    if 'FastChatOpenAILLM' in llm_model_info["provides"]:
        modelInsLLM.set_api_base_url(llm_model_info['api_base_url'])
        modelInsLLM.call_model_name(llm_model_info['name'])
        modelInsLLM.set_api_key(llm_model_info['api_key'])
    return modelInsLLM
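A hypothetical usage sketch: it assumes shared.loaderCheckPoint already holds the LoaderCheckPoint instance built from the command-line parameters (see the sketch above) and that 'chatglm2-6b-32k' is a key of llm_model_dict in model_config.

# Hypothetical usage of loaderLLM; module path and model key are assumptions.
import models.shared as shared

llm_model_ins = shared.loaderLLM(
    llm_model='chatglm2-6b-32k',
    no_remote_model=False,
    use_ptuning_v2=False,
)
# llm_model_ins is now an instance of the class named in llm_model_info['provides'],
# e.g. ChatGLMLLMChain, wrapping the loaded checkpoint.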
loader.py
def reload_model(self):
    self.unload_model()
    self.model_config = self._load_model_config()
    if self.use_ptuning_v2:
        try:
            with open(Path(f'{os.path.abspath(self.ptuning_dir)}/config.json'), 'r') as prefix_encoder_file:
                prefix_encoder_config = json.loads(prefix_encoder_file.read())
            self.model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
            self.model_config.prefix_projection = prefix_encoder_config['prefix_projection']
        except Exception as e:
            print(e)
            print("Failed to load the PrefixEncoder config.json")
    self.model, self.tokenizer = self._load_model()
    if self.lora:
        self._add_lora_to_model([self.lora])
    if self.use_ptuning_v2:
        try:
            prefix_state_dict = torch.load(Path(f'{os.path.abspath(self.ptuning_dir)}/pytorch_model.bin'))
            new_prefix_state_dict = {}
            for k, v in prefix_state_dict.items():
                if k.startswith("transformer.prefix_encoder."):
                    new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
            self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
            self.model.transformer.prefix_encoder.float()
            print("Loaded the p-tuning checkpoint successfully!")
        except Exception as e:
            print(e)
            print("Failed to load the PrefixEncoder model parameters")
    # llama-cpp models (at least vicuna-13b) do not provide an eval() method, so skip it
    if not self.is_llamacpp and not self.is_chatgmlcpp:
        self.model = self.model.eval()
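As a quick sanity check before enabling use_ptuning_v2, the directory pointed to by ptuning_dir can be verified to contain what reload_model expects; a small hedged helper (the function name is made up):

# Hypothetical helper: sanity-check a P-Tuning v2 output directory before pointing
# ptuning_dir at it. reload_model above expects config.json (with pre_seq_len and
# prefix_projection) plus pytorch_model.bin containing transformer.prefix_encoder.* keys.
import json
import os
import torch

def check_ptuning_dir(ptuning_dir: str) -> None:
    with open(os.path.join(ptuning_dir, 'config.json'), 'r') as f:
        cfg = json.load(f)
    assert 'pre_seq_len' in cfg and 'prefix_projection' in cfg, "not a P-Tuning v2 config"

    state_dict = torch.load(os.path.join(ptuning_dir, 'pytorch_model.bin'), map_location='cpu')
    prefix_keys = [k for k in state_dict if k.startswith("transformer.prefix_encoder.")]
    print(f"pre_seq_len={cfg['pre_seq_len']}, prefix_projection={cfg['prefix_projection']}, "
          f"{len(prefix_keys)} prefix-encoder tensors found")

check_ptuning_dir("ptuning-v2")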
Clearing GPU memory
GPU memory is cleared before a model is loaded.
loader.py
def unload_model(self):
    del self.model
    del self.tokenizer
    self.model = self.tokenizer = None
    self.clear_torch_cache()

def clear_torch_cache(self):
    # garbage-collect to avoid memory leaks and keep memory usage tight
    gc.collect()
    if self.llm_device.lower() != "cpu":
        # check whether the system supports MPS, Apple's GPU-acceleration framework on Mac devices
        if torch.has_mps:
            try:
                from torch.mps import empty_cache
                empty_cache()
            except Exception as e:
                print(e)
                print(
                    "On macOS it is recommended to upgrade PyTorch to 2.0.0 or later so that memory allocated by torch can be released promptly.")
        elif torch.has_cuda:
            device_id = "0" if torch.cuda.is_available() and (":" not in self.llm_device) else None
            CUDA_DEVICE = f"{self.llm_device}:{device_id}" if device_id else self.llm_device
            with torch.cuda.device(CUDA_DEVICE):
                # Release unused memory held in the GPU cache.
                # When PyTorch allocates and frees GPU memory, part of it is kept in a cache for reuse;
                # empty_cache() returns that cached memory to the driver.
                torch.cuda.empty_cache()
                # Garbage-collect CUDA IPC shared memory.
                # In multi-process GPU training, processes share some memory;
                # ipc_collect() explicitly collects that shared-memory garbage.
                torch.cuda.ipc_collect()
        else:
            print("Neither CUDA nor MPS was detected; clearing GPU memory is not supported.")
The model-loading call chain
The _load_model method in loader.py
model = LoaderClass.from_pretrained(checkpoint,
                                    config=self.model_config,
                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
                                    trust_remote_code=True).half()
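LoaderClass is resolved earlier in _load_model; assuming it ends up being transformers.AutoModel for a ChatGLM-style checkpoint (an assumption, the real choice depends on the model name), a standalone equivalent of this call looks roughly like:

# Minimal standalone sketch, assuming LoaderClass resolves to transformers.AutoModel.
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

checkpoint = "THUDM/chatglm2-6b-32k"
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(
    checkpoint,
    config=config,
    torch_dtype=torch.float16,      # or torch.bfloat16 when bf16 is set
    trust_remote_code=True,
).half()
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)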
The from_pretrained method in auto_factory.py
Package path: site-packages/transformers/models/auto/auto_factory.py
Purpose: associates a configuration class with a model class so that the right model class can be retrieved and instantiated for a given config. This is how model selection is managed across different configurations, e.g. choosing a different architecture or parameter set per config.
cls.register(config.__class__, model_class, exist_ok=True)
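cls here is the auto class itself (e.g. AutoModel). The same registration mechanism is exposed publicly, so a custom config/model pair can be wired into the auto classes the same way; a sketch with made-up placeholder classes:

# Hypothetical example of the same registration mechanism via the public API.
# MyConfig / MyModel are placeholder classes invented for illustration.
from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel

class MyConfig(PretrainedConfig):
    model_type = "my-model"

class MyModel(PreTrainedModel):
    config_class = MyConfig
    def __init__(self, config):
        super().__init__(config)

AutoConfig.register("my-model", MyConfig)
AutoModel.register(MyConfig, MyModel)   # same mechanism as cls.register(...) above
# AutoModel.from_config(...) / AutoModel.from_pretrained(...) will now resolve MyConfig to MyModel.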
The from_pretrained method in modeling_utils.py
Package path: site-packages/transformers/modeling_utils.py
Purpose: since no explicit local model path was specified, the weights can only be downloaded and loaded through the cache.
resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
# Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None
# result when internet is up, the repo and revision exist, but the file does not.
if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant):
    # Maybe the checkpoint is sharded, we try to grab the index name in this case.
    resolved_archive_file = cached_file(
        pretrained_model_name_or_path,
        _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant),
        **cached_file_kwargs,
    )
...
# We'll need to download and cache each checkpoint shard if the checkpoint is sharded.
if is_sharded:
    # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case.
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
        pretrained_model_name_or_path,
        resolved_archive_file,
        cache_dir=cache_dir,
        force_download=force_download,
        proxies=proxies,
        resume_download=resume_download,
        local_files_only=local_files_only,
        use_auth_token=token,
        user_agent=user_agent,
        revision=revision,
        subfolder=subfolder,
        _commit_hash=commit_hash,
    )
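cached_file is the helper doing the actual resolution above; it can also be called directly to fetch (or reuse from the local cache) a single file of a repo. A hedged standalone example:

# Hypothetical standalone use of the same cached_file helper: download (or reuse from
# the local Hugging Face cache) one file of the repo that from_pretrained would resolve.
from transformers.utils import cached_file

resolved = cached_file(
    "THUDM/chatglm2-6b-32k",   # pretrained_model_name_or_path
    "config.json",             # filename
)
print(resolved)  # absolute path inside the local cache, e.g. ~/.cache/huggingface/hub/...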
The get_checkpoint_shard_files method in hub.py
Package path: site-packages/transformers/utils/hub.py
Purpose: on the first launch of the project, download the model shards into the local cache.
for shard_filename in tqdm(shard_filenames, desc="Downloading shards", disable=not show_progress_bar):
    try:
        # Load from URL
        cached_filename = cached_file(
            pretrained_model_name_or_path,
            shard_filename,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
            user_agent=user_agent,
            revision=revision,
            subfolder=subfolder,
            _commit_hash=_commit_hash,
        )
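To avoid this download happening during the first launch, the whole repository can be fetched into the same local cache ahead of time; a sketch using huggingface_hub directly (the pattern list is an assumption about which files are needed):

# Hypothetical alternative: pre-download every shard so that get_checkpoint_shard_files
# only hits the local cache at startup.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="THUDM/chatglm2-6b-32k",
    allow_patterns=["*.json", "*.bin", "*.py", "*.model"],  # config, shards, remote code, tokenizer
)
print(local_dir)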
The _load_pretrained_model method in modeling_utils.py
Package path: site-packages/transformers/modeling_utils.py
Purpose: iterate over the weight shards and load them one by one, skipping shard files that only contain disk-offloaded weights, while showing the loading progress bar (the one below); at this point the model weights have not yet been moved into GPU memory.
if len(resolved_archive_file) > 1:
    resolved_archive_file = logging.tqdm(resolved_archive_file, desc="Loading checkpoint shards")
for shard_file in resolved_archive_file:
    # Skip the load for shards that only contain disk-offloaded weights when using safetensors for the offload.
    if shard_file in disk_only_shard_files:
        continue
    state_dict = load_state_dict(shard_file)
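The list of shard files iterated here comes from the checkpoint index (e.g. pytorch_model.bin.index.json), whose weight_map ties every parameter name to a shard file; a quick way to inspect such an index (the file path is only an example):

# Hypothetical inspection of a sharded checkpoint index file: weight_map maps each
# parameter name to the shard file that from_pretrained will read it from.
import json
from collections import Counter

with open("pytorch_model.bin.index.json", "r") as f:   # example path inside the cached repo
    index = json.load(f)

shard_counts = Counter(index["weight_map"].values())
for shard, n_params in sorted(shard_counts.items()):
    print(f"{shard}: {n_params} tensors")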
Back to the _load_model method in loader.py
This part mainly loads the model into GPU memory; multi-GPU loading can be used.
else:
    # The following is the default multi-GPU loading scheme; it almost never fails on new models.
    # Tested on chatglm2-6b, bloom-3b and bloomz-7b1, with fairly balanced GPU load.
    from accelerate.utils import get_balanced_memory
    max_memory = get_balanced_memory(model,
                                     dtype=torch.int8 if self.load_in_8bit else None,
                                     low_zero=False,
                                     no_split_module_classes=model._no_split_modules)
    self.device_map = infer_auto_device_map(model,
                                            dtype=torch.float16 if not self.load_in_8bit else torch.int8,
                                            max_memory=max_memory,
                                            no_split_module_classes=model._no_split_modules)
    model = dispatch_model(model, device_map=self.device_map)
- Before the code above runs, GPU memory usage is 0.
- max_memory = get_balanced_memory(…): calls get_balanced_memory to work out an appropriate memory-allocation plan; once it finishes, each GPU shows a small amount of memory usage.
- self.device_map = infer_auto_device_map(…): infers a device map from the model, dtype and memory allocation, assigning different parts of the model to different devices.
- model = dispatch_model(model, device_map=self.device_map): dispatches the parts of the model to their devices according to the generated device map. The model can then run on multiple GPUs in parallel for better throughput, and at this point all model weights are loaded into GPU memory.
Note, however, that this multi-GPU loading currently has a bug: the model crashes as soon as a question is asked, so loading onto a single specified GPU is recommended.
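Given that caution, a hedged sketch of pinning the whole model to a single GPU instead of relying on the automatic device map (both options assume transformers with accelerate installed):

# Hypothetical single-GPU alternatives to the automatic multi-GPU dispatch above.
import os
import torch
from transformers import AutoModel

# Option 1: hide all but one GPU (must be set before torch initializes CUDA).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Option 2: let accelerate place the whole model on one device via an explicit device map.
model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b-32k",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": 0},        # the empty key means "the entire model" -> cuda:0
)

Either way, all weights stay on one card and the cross-GPU dispatch path is avoided entirely.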