Instructions to use OpenNLPLab/TransNormerLLM-385M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use OpenNLPLab/TransNormerLLM-385M with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="OpenNLPLab/TransNormerLLM-385M", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("OpenNLPLab/TransNormerLLM-385M", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use OpenNLPLab/TransNormerLLM-385M with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "OpenNLPLab/TransNormerLLM-385M"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenNLPLab/TransNormerLLM-385M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/OpenNLPLab/TransNormerLLM-385M

SGLang

How to use OpenNLPLab/TransNormerLLM-385M with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "OpenNLPLab/TransNormerLLM-385M" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenNLPLab/TransNormerLLM-385M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "OpenNLPLab/TransNormerLLM-385M" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenNLPLab/TransNormerLLM-385M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use OpenNLPLab/TransNormerLLM-385M with Docker Model Runner:
```
docker model run hf.co/OpenNLPLab/TransNormerLLM-385M
```

TransNormerLLM-385M / utils.py

OpenNLPLab

Publish 385M Model

e423fc2 over 2 years ago

raw

history blame contribute delete

3.77 kB

	import logging
	import os
	import sys

	import torch
	from torch import nn
	import torch.distributed as dist
	import torch.nn.functional as F

	from .norm import SimpleRMSNorm as SimpleRMSNormTorch
	from .srmsnorm_triton import SimpleRMSNorm as SimpleRMSNormTriton

	# Feature switches read from the environment. The raw text is passed to
	# eval(), so values must be Python expressions such as "True" or "False".
	# NOTE(review): eval() executes arbitrary code found in these variables;
	# only run this module in environments whose variables are trusted, or
	# migrate to an explicit boolean parser.
	use_triton = eval(os.environ.get("use_triton", default="True"))
	debug = eval(os.environ.get("debug", default="False"))

	# Select which SimpleRMSNorm implementation the rest of the module exposes.
	if use_triton:
	    SimpleRMSNorm = SimpleRMSNormTriton
	else:
	    SimpleRMSNorm = SimpleRMSNormTorch

	# Configure root logging to stdout; the level comes from LOGLEVEL (default
	# INFO). Fields are separated by a plain "|" -- the previous "\|" was an
	# invalid escape sequence that leaked a literal backslash into every record.
	logging.basicConfig(
	    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	    level=os.environ.get("LOGLEVEL", "INFO").upper(),
	    stream=sys.stdout,
	)
	logger = logging.getLogger("print_config")

	# Granularity used by convert_to_multiple_of_base when rounding sizes up.
	BASE_DIM = 256


	def is_dist_avail_and_initialized():
	    """Return True only when torch.distributed is available and initialized."""
	    # Short-circuit: is_initialized() is consulted only if the distributed
	    # package is available at all.
	    return dist.is_available() and dist.is_initialized()


	def get_world_size():
	    """Return the distributed world size, or 1 when not running distributed."""
	    return dist.get_world_size() if is_dist_avail_and_initialized() else 1


	def get_rank():
	    """Return this process's distributed rank, or 0 when not running distributed."""
	    if is_dist_avail_and_initialized():
	        return dist.get_rank()
	    return 0


	def is_main_process():
	    """Return True when the current process has rank 0."""
	    rank = get_rank()
	    return rank == 0


	def logging_info(string):
	    """Log ``string`` at INFO level, but only from the main (rank 0) process."""
	    if not is_main_process():
	        return
	    logger.info(string)


	def print_params(**kwargs):
	    """Log the given keyword arguments as a configuration dump.

	    On the main process this emits a start line, one ``key: value`` line per
	    keyword (skipping ``"__class__"`` and ``"self"``), and an end line. Other
	    processes log nothing.

	    Raises:
	        KeyError: on the main process when ``"__class__"`` is not among the
	            keyword arguments, because the start line looks it up.
	    """
	    # This used to be a line-for-line copy of print_config; delegating keeps
	    # the two dumps identical by construction.
	    print_config(kwargs)


	def print_config(config):
	    """Log the entries of ``config`` from the main process only.

	    ``config`` is iterated for its keys and indexed with each key. It must
	    contain ``"__class__"``, which labels the start and end lines. The
	    ``"__class__"`` and ``"self"`` entries are not logged as values.
	    """
	    if not is_main_process():
	        return

	    skipped = ("__class__", "self")
	    logger.info(f"start print config of {config['__class__']}")
	    for name in config:
	        if name not in skipped:
	            logger.info(f"{name}: {config[name]}")
	    logger.info(f"end print config of {config['__class__']}")


	def print_module(module):
	    """Describe parameters that are not owned by a named submodule.

	    For every entry of ``module.named_parameters()`` whose first dotted name
	    component is not a name yielded by ``module.named_modules()``, one line
	    of the form ``(name): Tensor(<shape tuple>, requires_grad=<bool>)`` is
	    produced.

	    Returns:
	        The lines joined by newlines, or an empty string when nothing matches.
	    """
	    # Keep the names in a set: membership is tested once per parameter.
	    module_names = {entry[0] for entry in module.named_modules()}

	    lines = []
	    for entry in module.named_parameters():
	        owner = entry[0].split(".")[0]
	        if owner in module_names:
	            continue
	        param = entry[1]
	        lines.append("(" + owner + "): " + "Tensor(" +
	                     str(tuple(param.shape)) + ", requires_grad=" +
	                     str(param.requires_grad) + ")")

	    # Joining once avoids rebuilding the whole string for every parameter.
	    return "\n".join(lines)


	def get_activation_fn(activation):
	    """Return the activation callable registered under ``activation``.

	    Recognized names: "gelu", "relu", "elu", "sigmoid", "exp", "leak",
	    "1+elu", "2+elu", "silu", "swish" and "sine". Any other value logs a
	    notice and yields an identity function.
	    """
	    if debug:
	        logger.info(f"activation: {activation}")

	    def shifted_exp(x):
	        # The max over the last dimension is computed under no_grad; the
	        # exponential itself is evaluated outside that context.
	        # NOTE(review): nesting reconstructed from de-indented source --
	        # confirm that torch.exp belongs outside the no_grad block.
	        with torch.no_grad():
	            x_max = torch.max(x, dim=-1, keepdims=True).values
	        return torch.exp(x - x_max)

	    def elu_plus_one(x):
	        return 1 + F.elu(x)

	    def elu_plus_two(x):
	        return 2 + F.elu(x)

	    # Checked in order, mirroring a chain of equality tests.
	    known = (
	        ("gelu", F.gelu),
	        ("relu", F.relu),
	        ("elu", F.elu),
	        ("sigmoid", F.sigmoid),
	        ("exp", shifted_exp),
	        ("leak", F.leaky_relu),
	        ("1+elu", elu_plus_one),
	        ("2+elu", elu_plus_two),
	        ("silu", F.silu),
	        ("swish", F.silu),
	        ("sine", torch.sin),
	    )
	    for label, fn in known:
	        if activation == label:
	            return fn

	    logger.info(
	        f"activation: does not support {activation}, use Identity!!!")
	    return lambda x: x


	def get_norm_fn(norm_type):
	    """Return the normalization class for ``norm_type``.

	    "simplermsnorm" selects the module-level ``SimpleRMSNorm``; every other
	    value falls back to ``nn.LayerNorm``.
	    """
	    return SimpleRMSNorm if norm_type == "simplermsnorm" else nn.LayerNorm


	def convert_to_multiple_of_base(x):
	    """Round ``x`` up to a multiple of ``BASE_DIM``."""
	    # Adding BASE_DIM - 1 before the floor division rounds the quotient up.
	    blocks = (x + BASE_DIM - 1) // BASE_DIM
	    return BASE_DIM * blocks