diff --git a/frameworks/LLaMA-Factory/0.9.2/Dockerfile b/frameworks/LLaMA-Factory/0.9.2/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6c98ef4b62b336c17748d82facf86a1121a79e9b
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.2/Dockerfile
@@ -0,0 +1,130 @@
+# LLaMA-Factory 0.9.2 on OpenCloudOS 9 (CUDA 12.8 devel base image).
+# BuildKit is required: the RUN steps use cache mounts and a heredoc.
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="stronking 363133710@qq.com"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="LLaMA-Factory 0.9.2 on OpenCloudOS 9 with Python 3.11, CUDA 12.8"
+
+# Version pins; override with --build-arg.
+ARG LLAMAFACTORY_VERSION=0.9.2
+ARG VLLM_VERSION=0.7.0
+
+# Runtime environment. MODE, WEBUI_PORT, API_PORT and API_CONFIG are read by
+# start-llamafactory.sh. PIP_NO_CACHE_DIR is deliberately not set: it would
+# disable pip's cache and make the cache mounts on the RUN steps useless.
+# NOTE(review): /opt/venv is never created in this file; presumably the base
+# image provides it - confirm.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    TZ=Asia/Shanghai \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda \
+    PATH=/opt/venv/bin:/usr/local/cuda/bin:$PATH \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} \
+    HF_HOME=/data/huggingface \
+    MODE=webui \
+    WEBUI_PORT=7860 \
+    API_PORT=8000 \
+    API_CONFIG=/workspace/config/api.yaml
+
+WORKDIR /workspace
+
+# Packaging tools first, so the later installs run with an up-to-date pip.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install --upgrade pip setuptools wheel virtualenv
+
+# NOTE(review): no --index-url is given, so torch comes from pip's default
+# index; confirm the resolved wheel is the intended CUDA build.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
+
+# Install the pinned LLaMA-Factory and vLLM, then force the web-stack versions
+# listed below back in place after the other installs.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install "llamafactory[torch,metrics]==${LLAMAFACTORY_VERSION}" \
+    && pip install deepspeed==0.15.4 vllm==${VLLM_VERSION} \
+    && pip install --force-reinstall \
+        "pydantic==2.10.6" \
+        "fastapi==0.112.4" \
+        "starlette==0.37.2" \
+        "gradio==4.44.1" \
+        "gradio_client==1.3.0" \
+        "huggingface-hub>=0.26.0,<1.0"
+
+# Create the mount points before they are declared as volumes.
+RUN mkdir -p /workspace/config /data/huggingface /data/models /data/datasets /data/outputs
+
+# Entrypoint script; the quoted 'EOF' keeps ${...} literal until runtime.
+RUN cat > /usr/local/bin/start-llamafactory.sh <<'EOF'
+#!/usr/bin/env bash
+# Start LLaMA-Factory according to MODE: webui | api | all | bash.
+set -e
+
+# Gradio takes its bind address and port from these variables; exporting them
+# makes WEBUI_PORT effective even if the CLI does not consume --host/--port.
+export GRADIO_SERVER_NAME="${GRADIO_SERVER_NAME:-0.0.0.0}"
+export GRADIO_SERVER_PORT="${GRADIO_SERVER_PORT:-${WEBUI_PORT:-7860}}"
+
+echo "MODE=${MODE}"
+echo "LLAMAFACTORY_VERSION=$(python -c 'import llamafactory; print(getattr(llamafactory, "__version__", "unknown"))')"
+echo "WEBUI_PORT=${WEBUI_PORT}"
+echo "API_PORT=${API_PORT}"
+echo "API_CONFIG=${API_CONFIG}"
+
+case "${MODE}" in
+  webui)
+    exec llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}"
+    ;;
+
+  api)
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: API_CONFIG not found: ${API_CONFIG}"
+      exit 1
+    fi
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  all)
+    # Check the config before spawning the Web UI so a missing file fails
+    # fast instead of after a background process has been started.
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: MODE=all 需要 API_CONFIG: ${API_CONFIG}"
+      exit 1
+    fi
+
+    # Web UI runs in the background; the API server replaces this shell.
+    llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}" &
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  bash)
+    exec /bin/bash
+    ;;
+
+  *)
+    echo "Unsupported MODE: ${MODE}"
+    echo "可选:webui | api | all | bash"
+    exit 1
+    ;;
+esac
+EOF
+
+RUN chmod +x /usr/local/bin/start-llamafactory.sh
+
+EXPOSE 7860 8000
+
+VOLUME ["/workspace", "/data/huggingface", "/data/models", "/data/datasets", "/data/outputs"]
+
+CMD ["/usr/local/bin/start-llamafactory.sh"]
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.2/README.md b/frameworks/LLaMA-Factory/0.9.2/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d8df94f0ce85170849bcfb3ca4d3b8f654b5f464
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.2/README.md
@@ -0,0 +1,186 @@
+# LLaMA-Factory on OpenCloudOS 9
+
+## 基本信息
+
+- **框架名称**:LLaMA-Factory
+- **框架版本**:v0.9.2
+- **基础镜像**:opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**:3.11
+- **CUDA 版本**:12.8
+
+## 功能说明
+
+该镜像基于 OpenCloudOS 9 与 CUDA 12.8 构建,集成:
+
+- LLaMA-Factory
+- Web UI(Gradio)
+- OpenAI-compatible API
+- PyTorch CUDA 12.8
+- DeepSpeed
+- vLLM
+
+支持:
+
+- LoRA / QLoRA 微调
+- SFT / DPO / PPO 等训练方式
+- HuggingFace 模型加载
+- 多 GPU 训练与推理
+- OpenAI 风格接口服务
+
+---
+
+## 构建镜像
+
+```bash
+DOCKER_BUILDKIT=1 docker
build \ + --build-arg LLAMAFACTORY_VERSION=0.9.2 \ + -t oc9-llamafactory:0.9.2 . + +``` + +## 查看版本 +```bash +docker run --rm \ + oc9-llamafactory:0.9.2 \ + python -c "import llamafactory; print(llamafactory.__version__)" +``` + +## 启动Web UI + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=webui \ + oc9-llamafactory:0.9.2 +``` +启动后访问地址: +```aiignore +http://localhost:7860 +``` + +## 启动 API 服务 + +### 准备API配置文件 +```bash +mkdir -p workspace/config +``` +#### 示例 + +```yaml +model_name_or_path: /data/models/Qwen/Qwen2.5-7B-Instruct +template: qwen +infer_backend: vllm +finetuning_type: lora +``` + +### 启动命令 + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=api \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.2 +``` + +### 访问API地址 + +~~~ +http://localhost:8000 +~~~ + +## 同时启动 Web UI 与 API + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=all \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.2 +``` + +## 目录说明 + +| 容器目录 | 说明 | +| ------------------- | -------------- | +| `/workspace` | 工作目录 | +| `/workspace/config` | API 配置文件 | +| `/data/models` | 模型目录 | +| `/data/datasets` | 数据集目录 | +| `/data/outputs` | 训练输出目录 | +| `/data/huggingface` | HuggingFace 缓存 | + + +## 环境变量 + +| 变量名 | 默认值 | 说明 | +| ------------ | ---------------------------- | ----------------------- | +| `MODE` | `webui` | 启动模式:webui/api/all/bash | +| `WEBUI_PORT` | `7860` | Web UI 端口 | +| `API_PORT` | `8000` | API 端口 | +| `API_CONFIG` | `/workspace/config/api.yaml` | API 配置文件 | + +## 使用示例 + +### 进入容器 + +```bash +docker run -it --rm \ + --gpus all \ + oc9-llamafactory:0.9.2 \ + bash +``` + +查看GPU信息: + +```bash +nvidia-smi +``` + +启动命令行训练: + +```bash +llamafactory-cli train 
examples/train_lora/qwen_lora_sft.yaml
+```
+
+---
+
+## 已知问题
+
+* vLLM 对 CUDA / PyTorch / 驱动版本较敏感
+* 首次启动会下载 HuggingFace 模型,耗时较长
+* DeepSpeed 在部分驱动版本上可能需要额外 NCCL 配置
+* CUDA 12.8 需宿主机 NVIDIA Driver >= 550
+
+---
+
+## 推荐宿主机环境
+
+| 项目 | 建议 |
+| ------------------------ | -------- |
+| Docker | >= 24 |
+| NVIDIA Driver | >= 550 |
+| NVIDIA Container Toolkit | 已安装 |
+| GPU 显存 | >= 24GB |
+| 磁盘空间 | >= 100GB |
+
+---
+
+## 参考项目
+
+* [https://github.com/hiyouga/LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
+* [https://hub.docker.com/r/hiyouga/llamafactory](https://hub.docker.com/r/hiyouga/llamafactory)
+
diff --git a/frameworks/LLaMA-Factory/0.9.2/build.conf b/frameworks/LLaMA-Factory/0.9.2/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..f469e1135386ad4c2abdc68bcba93ef65f71d481
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.2/build.conf
@@ -0,0 +1,4 @@
+# LLaMA-Factory 0.9.2 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-llama-factory
+IMAGE_TAG=0.9.2
+GPU_TEST=true
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.2/test.sh b/frameworks/LLaMA-Factory/0.9.2/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5606369b7c4a02e1101f0a8711d184f9ee8b619
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.2/test.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Smoke tests for a LLaMA-Factory image. Usage: bash test.sh <image>
+# Every check runs in a fresh GPU-enabled container; the first failure aborts.
+set -e
+
+IMAGE="${1:?ERROR: 缺少镜像参数。用法: bash test.sh <image>}"
+
+echo "=== LLaMA-Factory 镜像功能测试 ==="
+echo "测试镜像: ${IMAGE}"
+echo
+
+# run_test NAME CMD [ARGS...]
+# Print NAME, run CMD, and exit the whole script with status 1 if it fails.
+run_test() {
+    local name="$1"
+    shift
+
+    echo "${name}..."
+    if "$@"; then
+        echo "✓ 通过"
+        echo
+    else
+        echo "✗ 失败"
+        exit 1
+    fi
+}
+
+# 1. Verify nvidia-smi. The output is captured first so that the exit status
+# of nvidia-smi itself, not that of `head`, decides the result.
+run_test "检查 nvidia-smi" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvidia-smi) && printf "%s\n" "$out" | head -n 10'
+
+# 2. Verify nvcc (captured before trimming, as in step 1)
+run_test "检查 nvcc --version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvcc --version) && printf "%s\n" "$out" | head -n 5'
+
+# 3. Verify CUDA / PyTorch
+run_test "检查 PyTorch CUDA" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print('torch:', torch.__version__)
+print('cuda:', torch.version.cuda)
+print('gpu:', torch.cuda.get_device_name(0))
+"
+
+# 4. Verify LLaMA-Factory import
+run_test "检查 LLaMA-Factory import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import llamafactory
+print('llamafactory:', getattr(llamafactory, '__version__', 'unknown'))
+"
+
+# 5. Verify llamafactory-cli
+run_test "检查 llamafactory-cli version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    llamafactory-cli version
+
+# 6. Verify that the Web UI starts (waits up to 40 s for the Gradio banner)
+run_test "检查 Web UI 启动" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc '
+set -e
+
+LOG=/tmp/llamafactory-webui.log
+
+llamafactory-cli webui > "$LOG" 2>&1 &
+PID=$!
+
+cleanup() {
+    kill "$PID" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+for i in $(seq 1 40); do
+    if grep -q "Running on local URL" "$LOG"; then
+        cat "$LOG"
+        exit 0
+    fi
+
+    if ! kill -0 "$PID" >/dev/null 2>&1; then
+        cat "$LOG"
+        exit 1
+    fi
+
+    sleep 1
+done
+
+cat "$LOG"
+exit 1
+'
+
+# 7. Verify the API command (captured before trimming, as in step 1)
+run_test "检查 API 命令" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(llamafactory-cli api --help) && printf "%s\n" "$out" | head -n 5'
+
+# 8. Verify vLLM import
+run_test "检查 vLLM import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import vllm
+print('vllm:', vllm.__version__)
+"
+
+# 9.
Verify the core inference dependency transformers
+run_test "检查 Transformers import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import transformers
+print('transformers:', transformers.__version__)
+"
+
+echo "=== 所有测试通过 ==="
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.2/test_result.png b/frameworks/LLaMA-Factory/0.9.2/test_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5e08b489fd0ee32ddea0b4bbf61f515149567dc
Binary files /dev/null and b/frameworks/LLaMA-Factory/0.9.2/test_result.png differ
diff --git a/frameworks/LLaMA-Factory/0.9.3/Dockerfile b/frameworks/LLaMA-Factory/0.9.3/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6d07023cb321468f08e5fc132e5dd95609f36254
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.3/Dockerfile
@@ -0,0 +1,123 @@
+# LLaMA-Factory 0.9.3 on OpenCloudOS 9 (CUDA 12.8 devel base image).
+# BuildKit is required: the RUN steps use cache mounts and a heredoc.
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="stronking 363133710@qq.com"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="LLaMA-Factory 0.9.3 on OpenCloudOS 9 with Python 3.11, CUDA 12.8"
+
+# Version pins; override with --build-arg.
+ARG LLAMAFACTORY_VERSION=0.9.3
+ARG VLLM_VERSION=0.9.1
+
+# Runtime environment. MODE, WEBUI_PORT, API_PORT and API_CONFIG are read by
+# start-llamafactory.sh. PIP_NO_CACHE_DIR is deliberately not set: it would
+# disable pip's cache and make the cache mounts on the RUN steps useless.
+# NOTE(review): /opt/venv is never created in this file; presumably the base
+# image provides it - confirm.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    TZ=Asia/Shanghai \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda \
+    PATH=/opt/venv/bin:/usr/local/cuda/bin:$PATH \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} \
+    HF_HOME=/data/huggingface \
+    MODE=webui \
+    WEBUI_PORT=7860 \
+    API_PORT=8000 \
+    API_CONFIG=/workspace/config/api.yaml
+
+WORKDIR /workspace
+
+# Packaging tools first, so the later installs run with an up-to-date pip.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install --upgrade pip setuptools wheel virtualenv
+
+# PyTorch CUDA 12.8
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Install the pinned LLaMA-Factory and vLLM, then pin the Gradio versions.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install "llamafactory[torch,metrics]==${LLAMAFACTORY_VERSION}" \
+    && pip install deepspeed==0.16.1 vllm==${VLLM_VERSION} \
+    && pip install gradio==5.31.0 gradio_client==1.10.1
+
+# Create the mount points before they are declared as volumes.
+RUN mkdir -p /workspace/config /data/huggingface /data/models /data/datasets /data/outputs
+
+# Entrypoint script; the quoted 'EOF' keeps ${...} literal until runtime.
+RUN cat > /usr/local/bin/start-llamafactory.sh <<'EOF'
+#!/usr/bin/env bash
+# Start LLaMA-Factory according to MODE: webui | api | all | bash.
+set -e
+
+# Gradio takes its bind address and port from these variables; exporting them
+# makes WEBUI_PORT effective even if the CLI does not consume --host/--port.
+export GRADIO_SERVER_NAME="${GRADIO_SERVER_NAME:-0.0.0.0}"
+export GRADIO_SERVER_PORT="${GRADIO_SERVER_PORT:-${WEBUI_PORT:-7860}}"
+
+echo "MODE=${MODE}"
+echo "LLAMAFACTORY_VERSION=$(python -c 'import llamafactory; print(getattr(llamafactory, "__version__", "unknown"))')"
+echo "WEBUI_PORT=${WEBUI_PORT}"
+echo "API_PORT=${API_PORT}"
+echo "API_CONFIG=${API_CONFIG}"
+
+case "${MODE}" in
+  webui)
+    exec llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}"
+    ;;
+
+  api)
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: API_CONFIG not found: ${API_CONFIG}"
+      exit 1
+    fi
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  all)
+    # Check the config before spawning the Web UI so a missing file fails
+    # fast instead of after a background process has been started.
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: MODE=all 需要 API_CONFIG: ${API_CONFIG}"
+      exit 1
+    fi
+
+    # Web UI runs in the background; the API server replaces this shell.
+    llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}" &
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  bash)
+    exec /bin/bash
+    ;;
+
+  *)
+    echo "Unsupported MODE: ${MODE}"
+    echo "可选:webui | api | all | bash"
+    exit 1
+    ;;
+esac
+EOF
+
+RUN chmod +x /usr/local/bin/start-llamafactory.sh
+
+EXPOSE 7860 8000
+
+VOLUME ["/workspace", "/data/huggingface", "/data/models", "/data/datasets", "/data/outputs"]
+
+CMD ["/usr/local/bin/start-llamafactory.sh"]
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.3/README.md b/frameworks/LLaMA-Factory/0.9.3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..701cbe06f996c46973defcd22d94c0ab95eb69d8
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.3/README.md
@@ -0,0 +1,186 @@
+# LLaMA-Factory on OpenCloudOS 9
+
+## 基本信息
+
+- **框架名称**:LLaMA-Factory
+- **框架版本**:v0.9.3
+- **基础镜像**:opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**:3.11
+- **CUDA 版本**:12.8
+
+## 功能说明
+
+该镜像基于 OpenCloudOS 9 与 CUDA 12.8 构建,集成:
+
+- LLaMA-Factory
+- Web UI(Gradio)
+- OpenAI-compatible API
+- PyTorch CUDA 12.8
+- DeepSpeed
+- vLLM
+
+支持:
+
+- LoRA / QLoRA 微调
+- SFT / DPO / PPO 等训练方式
+- HuggingFace 模型加载
+- 多 GPU 训练与推理
+- OpenAI 风格接口服务
+
+---
+
+## 构建镜像
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg LLAMAFACTORY_VERSION=0.9.3 \
+    -t oc9-llamafactory:0.9.3 .
+ +``` + +## 查看版本 +```bash +docker run --rm \ + oc9-llamafactory:0.9.3 \ + python -c "import llamafactory; print(llamafactory.__version__)" +``` + +## 启动Web UI + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=webui \ + oc9-llamafactory:0.9.3 +``` +启动后访问地址: +```aiignore +http://localhost:7860 +``` + +## 启动 API 服务 + +### 准备API配置文件 +```bash +mkdir -p workspace/config +``` +#### 示例 + +```yaml +model_name_or_path: /data/models/Qwen/Qwen2.5-7B-Instruct +template: qwen +infer_backend: vllm +finetuning_type: lora +``` + +### 启动命令 + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=api \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.3 +``` + +### 访问API地址 + +~~~ +http://localhost:8000 +~~~ + +## 同时启动 Web UI 与 API + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=all \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.3 +``` + +## 目录说明 + +| 容器目录 | 说明 | +| ------------------- | -------------- | +| `/workspace` | 工作目录 | +| `/workspace/config` | API 配置文件 | +| `/data/models` | 模型目录 | +| `/data/datasets` | 数据集目录 | +| `/data/outputs` | 训练输出目录 | +| `/data/huggingface` | HuggingFace 缓存 | + + +## 环境变量 + +| 变量名 | 默认值 | 说明 | +| ------------ | ---------------------------- | ----------------------- | +| `MODE` | `webui` | 启动模式:webui/api/all/bash | +| `WEBUI_PORT` | `7860` | Web UI 端口 | +| `API_PORT` | `8000` | API 端口 | +| `API_CONFIG` | `/workspace/config/api.yaml` | API 配置文件 | + +## 使用示例 + +### 进入容器 + +```bash +docker run -it --rm \ + --gpus all \ + oc9-llamafactory:0.9.3 \ + bash +``` + +查看GPU信息: + +```bash +nvidia-smi +``` + +启动命令行训练: + +```bash +llamafactory-cli train examples/train_lora/qwen_lora_sft.yaml +``` + +--- + +## 已知问题 + +* vLLM 对 CUDA / PyTorch / 
驱动版本较敏感
+* 首次启动会下载 HuggingFace 模型,耗时较长
+* DeepSpeed 在部分驱动版本上可能需要额外 NCCL 配置
+* CUDA 12.8 需宿主机 NVIDIA Driver >= 550
+
+---
+
+## 推荐宿主机环境
+
+| 项目 | 建议 |
+| ------------------------ | -------- |
+| Docker | >= 24 |
+| NVIDIA Driver | >= 550 |
+| NVIDIA Container Toolkit | 已安装 |
+| GPU 显存 | >= 24GB |
+| 磁盘空间 | >= 100GB |
+
+---
+
+## 参考项目
+
+* [https://github.com/hiyouga/LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
+* [https://hub.docker.com/r/hiyouga/llamafactory](https://hub.docker.com/r/hiyouga/llamafactory)
+
diff --git a/frameworks/LLaMA-Factory/0.9.3/build.conf b/frameworks/LLaMA-Factory/0.9.3/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..9383c2c1d8cfe4b225c047658962242edaaa22a8
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.3/build.conf
@@ -0,0 +1,4 @@
+# LLaMA-Factory 0.9.3 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-llama-factory
+IMAGE_TAG=0.9.3
+GPU_TEST=true
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.3/test.sh b/frameworks/LLaMA-Factory/0.9.3/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5606369b7c4a02e1101f0a8711d184f9ee8b619
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.3/test.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Smoke tests for a LLaMA-Factory image. Usage: bash test.sh <image>
+# Every check runs in a fresh GPU-enabled container; the first failure aborts.
+set -e
+
+IMAGE="${1:?ERROR: 缺少镜像参数。用法: bash test.sh <image>}"
+
+echo "=== LLaMA-Factory 镜像功能测试 ==="
+echo "测试镜像: ${IMAGE}"
+echo
+
+# run_test NAME CMD [ARGS...]
+# Print NAME, run CMD, and exit the whole script with status 1 if it fails.
+run_test() {
+    local name="$1"
+    shift
+
+    echo "${name}..."
+    if "$@"; then
+        echo "✓ 通过"
+        echo
+    else
+        echo "✗ 失败"
+        exit 1
+    fi
+}
+
+# 1. Verify nvidia-smi. The output is captured first so that the exit status
+# of nvidia-smi itself, not that of `head`, decides the result.
+run_test "检查 nvidia-smi" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvidia-smi) && printf "%s\n" "$out" | head -n 10'
+
+# 2. Verify nvcc (captured before trimming, as in step 1)
+run_test "检查 nvcc --version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvcc --version) && printf "%s\n" "$out" | head -n 5'
+
+# 3. Verify CUDA / PyTorch
+run_test "检查 PyTorch CUDA" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print('torch:', torch.__version__)
+print('cuda:', torch.version.cuda)
+print('gpu:', torch.cuda.get_device_name(0))
+"
+
+# 4. Verify LLaMA-Factory import
+run_test "检查 LLaMA-Factory import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import llamafactory
+print('llamafactory:', getattr(llamafactory, '__version__', 'unknown'))
+"
+
+# 5. Verify llamafactory-cli
+run_test "检查 llamafactory-cli version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    llamafactory-cli version
+
+# 6. Verify that the Web UI starts (waits up to 40 s for the Gradio banner)
+run_test "检查 Web UI 启动" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc '
+set -e
+
+LOG=/tmp/llamafactory-webui.log
+
+llamafactory-cli webui > "$LOG" 2>&1 &
+PID=$!
+
+cleanup() {
+    kill "$PID" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+for i in $(seq 1 40); do
+    if grep -q "Running on local URL" "$LOG"; then
+        cat "$LOG"
+        exit 0
+    fi
+
+    if ! kill -0 "$PID" >/dev/null 2>&1; then
+        cat "$LOG"
+        exit 1
+    fi
+
+    sleep 1
+done
+
+cat "$LOG"
+exit 1
+'
+
+# 7. Verify the API command (captured before trimming, as in step 1)
+run_test "检查 API 命令" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(llamafactory-cli api --help) && printf "%s\n" "$out" | head -n 5'
+
+# 8. Verify vLLM import
+run_test "检查 vLLM import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import vllm
+print('vllm:', vllm.__version__)
+"
+
+# 9.
Verify the core inference dependency transformers
+run_test "检查 Transformers import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import transformers
+print('transformers:', transformers.__version__)
+"
+
+echo "=== 所有测试通过 ==="
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.3/test_result.png b/frameworks/LLaMA-Factory/0.9.3/test_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..2dc41e76e26bb1e814941a67800abb1309131b38
Binary files /dev/null and b/frameworks/LLaMA-Factory/0.9.3/test_result.png differ
diff --git a/frameworks/LLaMA-Factory/0.9.4/Dockerfile b/frameworks/LLaMA-Factory/0.9.4/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f65ea6ce29482e4bb158bad10856ad9fb6017f15
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.4/Dockerfile
@@ -0,0 +1,124 @@
+# LLaMA-Factory 0.9.4 on OpenCloudOS 9 (CUDA 12.8 devel base image).
+# BuildKit is required: the RUN steps use cache mounts and a heredoc.
+FROM opencloudos/opencloudos9-cuda-devel:12.8
+
+LABEL maintainer="stronking 363133710@qq.com"
+LABEL org.opencontainers.image.source="https://gitee.com/OpenCloudOS/ai-agent-container"
+LABEL org.opencontainers.image.description="LLaMA-Factory on OpenCloudOS 9 with Python 3.11, CUDA 12.8"
+
+# Version pins and the PyTorch wheel index; override with --build-arg.
+ARG VLLM_VERSION=0.10.2
+ARG TORCH_INDEX_URL=https://download.pytorch.org/whl/cu128
+ARG LLAMAFACTORY_VERSION=0.9.4
+
+# Runtime environment. MODE, WEBUI_PORT, API_PORT and API_CONFIG are read by
+# start-llamafactory.sh. PIP_NO_CACHE_DIR is deliberately not set: it would
+# disable pip's cache and make the cache mounts on the RUN steps useless.
+# NOTE(review): /opt/venv is never created in this file; presumably the base
+# image provides it - confirm.
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    TZ=Asia/Shanghai \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda \
+    PATH=/opt/venv/bin:/usr/local/cuda/bin:$PATH \
+    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} \
+    HF_HOME=/data/huggingface \
+    MODE=webui \
+    WEBUI_PORT=7860 \
+    API_PORT=8000 \
+    API_CONFIG=/workspace/config/api.yaml
+
+WORKDIR /workspace
+
+# Packaging tools first, so the later installs run with an up-to-date pip.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install --upgrade pip setuptools wheel virtualenv
+
+# PyTorch CUDA 12.8; the wheel index comes from the TORCH_INDEX_URL build arg.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 \
+    --index-url "${TORCH_INDEX_URL}"
+
+# Install the pinned LLaMA-Factory and vLLM, then pin the Gradio versions.
+RUN --mount=type=cache,id=pip-cache-opencloudos9-cu128,target=/root/.cache/pip \
+    pip install "llamafactory[torch,metrics]==${LLAMAFACTORY_VERSION}" \
+    && pip install deepspeed==0.16.4 vllm==${VLLM_VERSION} \
+    && pip install gradio==5.50.0 gradio_client==1.14.0
+
+# Create the mount points before they are declared as volumes.
+RUN mkdir -p /workspace/config /data/huggingface /data/models /data/datasets /data/outputs
+
+# Entrypoint script; the quoted 'EOF' keeps ${...} literal until runtime.
+RUN cat > /usr/local/bin/start-llamafactory.sh <<'EOF'
+#!/usr/bin/env bash
+# Start LLaMA-Factory according to MODE: webui | api | all | bash.
+set -e
+
+# Gradio takes its bind address and port from these variables; exporting them
+# makes WEBUI_PORT effective even if the CLI does not consume --host/--port.
+export GRADIO_SERVER_NAME="${GRADIO_SERVER_NAME:-0.0.0.0}"
+export GRADIO_SERVER_PORT="${GRADIO_SERVER_PORT:-${WEBUI_PORT:-7860}}"
+
+echo "MODE=${MODE}"
+echo "LLAMAFACTORY_VERSION=$(python -c 'import llamafactory; print(getattr(llamafactory, "__version__", "unknown"))')"
+echo "WEBUI_PORT=${WEBUI_PORT}"
+echo "API_PORT=${API_PORT}"
+echo "API_CONFIG=${API_CONFIG}"
+
+case "${MODE}" in
+  webui)
+    exec llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}"
+    ;;
+
+  api)
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: API_CONFIG not found: ${API_CONFIG}"
+      exit 1
+    fi
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  all)
+    # Check the config before spawning the Web UI so a missing file fails
+    # fast instead of after a background process has been started.
+    if [ ! -f "${API_CONFIG}" ]; then
+      echo "ERROR: MODE=all 需要 API_CONFIG: ${API_CONFIG}"
+      exit 1
+    fi
+
+    # Web UI runs in the background; the API server replaces this shell.
+    llamafactory-cli webui \
+      --host 0.0.0.0 \
+      --port "${WEBUI_PORT}" &
+
+    exec llamafactory-cli api "${API_CONFIG}" \
+      api_host=0.0.0.0 \
+      api_port="${API_PORT}"
+    ;;
+
+  bash)
+    exec /bin/bash
+    ;;
+
+  *)
+    echo "Unsupported MODE: ${MODE}"
+    echo "可选:webui | api | all | bash"
+    exit 1
+    ;;
+esac
+EOF
+
+RUN chmod +x /usr/local/bin/start-llamafactory.sh
+
+EXPOSE 7860 8000
+
+VOLUME ["/workspace", "/data/huggingface", "/data/models", "/data/datasets", "/data/outputs"]
+
+CMD ["/usr/local/bin/start-llamafactory.sh"]
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.4/README.md b/frameworks/LLaMA-Factory/0.9.4/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..71e895178c952cd47c025b3f66c87deb3d22b51a
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.4/README.md
@@ -0,0 +1,186 @@
+# LLaMA-Factory on OpenCloudOS 9
+
+## 基本信息
+
+- **框架名称**:LLaMA-Factory
+- **框架版本**:v0.9.4
+- **基础镜像**:opencloudos/opencloudos9-cuda-devel:12.8
+- **Python 版本**:3.11
+- **CUDA 版本**:12.8
+
+## 功能说明
+
+该镜像基于 OpenCloudOS 9 与 CUDA 12.8 构建,集成:
+
+- LLaMA-Factory
+- Web UI(Gradio)
+- OpenAI-compatible API
+- PyTorch CUDA 12.8
+- DeepSpeed
+- vLLM
+
+支持:
+
+- LoRA / QLoRA 微调
+- SFT / DPO / PPO 等训练方式
+- HuggingFace 模型加载
+- 多 GPU 训练与推理
+- OpenAI 风格接口服务
+
+---
+
+## 构建镜像
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg LLAMAFACTORY_VERSION=0.9.4 \
+    -t oc9-llamafactory:0.9.4 .
+ +``` + +## 查看版本 +```bash +docker run --rm \ + oc9-llamafactory:0.9.4 \ + python -c "import llamafactory; print(llamafactory.__version__)" +``` + +## 启动Web UI + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=webui \ + oc9-llamafactory:0.9.4 +``` +启动后访问地址: +```aiignore +http://localhost:7860 +``` + +## 启动 API 服务 + +### 准备API配置文件 +```bash +mkdir -p workspace/config +``` +#### 示例 + +```yaml +model_name_or_path: /data/models/Qwen/Qwen2.5-7B-Instruct +template: qwen +infer_backend: vllm +finetuning_type: lora +``` + +### 启动命令 + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=api \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.4 +``` + +### 访问API地址 + +~~~ +http://localhost:8000 +~~~ + +## 同时启动 Web UI 与 API + +```bash +docker run -it --rm \ + --gpus all \ + --ipc=host \ + -p 7860:7860 \ + -p 8000:8000 \ + -v ./workspace:/workspace \ + -v ./data:/data \ + -e MODE=all \ + -e API_CONFIG=/workspace/config/api.yaml \ + oc9-llamafactory:0.9.4 +``` + +## 目录说明 + +| 容器目录 | 说明 | +| ------------------- | -------------- | +| `/workspace` | 工作目录 | +| `/workspace/config` | API 配置文件 | +| `/data/models` | 模型目录 | +| `/data/datasets` | 数据集目录 | +| `/data/outputs` | 训练输出目录 | +| `/data/huggingface` | HuggingFace 缓存 | + + +## 环境变量 + +| 变量名 | 默认值 | 说明 | +| ------------ | ---------------------------- | ----------------------- | +| `MODE` | `webui` | 启动模式:webui/api/all/bash | +| `WEBUI_PORT` | `7860` | Web UI 端口 | +| `API_PORT` | `8000` | API 端口 | +| `API_CONFIG` | `/workspace/config/api.yaml` | API 配置文件 | + +## 使用示例 + +### 进入容器 + +```bash +docker run -it --rm \ + --gpus all \ + oc9-llamafactory:0.9.4 \ + bash +``` + +查看GPU信息: + +```bash +nvidia-smi +``` + +启动命令行训练: + +```bash +llamafactory-cli train examples/train_lora/qwen_lora_sft.yaml +``` + +--- + +## 已知问题 + +* vLLM 对 CUDA / PyTorch / 
驱动版本较敏感
+* 首次启动会下载 HuggingFace 模型,耗时较长
+* DeepSpeed 在部分驱动版本上可能需要额外 NCCL 配置
+* CUDA 12.8 需宿主机 NVIDIA Driver >= 550
+
+---
+
+## 推荐宿主机环境
+
+| 项目 | 建议 |
+| ------------------------ | -------- |
+| Docker | >= 24 |
+| NVIDIA Driver | >= 550 |
+| NVIDIA Container Toolkit | 已安装 |
+| GPU 显存 | >= 24GB |
+| 磁盘空间 | >= 100GB |
+
+---
+
+## 参考项目
+
+* [https://github.com/hiyouga/LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
+* [https://hub.docker.com/r/hiyouga/llamafactory](https://hub.docker.com/r/hiyouga/llamafactory)
+
diff --git a/frameworks/LLaMA-Factory/0.9.4/build.conf b/frameworks/LLaMA-Factory/0.9.4/build.conf
new file mode 100644
index 0000000000000000000000000000000000000000..33710cc14989f27427bf72a54327a2d53d0ac197
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.4/build.conf
@@ -0,0 +1,4 @@
+# LLaMA-Factory 0.9.4 on OpenCloudOS 9 (GPU)
+IMAGE_NAME=oc9-llama-factory
+IMAGE_TAG=0.9.4
+GPU_TEST=true
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.4/test.sh b/frameworks/LLaMA-Factory/0.9.4/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5606369b7c4a02e1101f0a8711d184f9ee8b619
--- /dev/null
+++ b/frameworks/LLaMA-Factory/0.9.4/test.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# Smoke tests for a LLaMA-Factory image. Usage: bash test.sh <image>
+# Every check runs in a fresh GPU-enabled container; the first failure aborts.
+set -e
+
+IMAGE="${1:?ERROR: 缺少镜像参数。用法: bash test.sh <image>}"
+
+echo "=== LLaMA-Factory 镜像功能测试 ==="
+echo "测试镜像: ${IMAGE}"
+echo
+
+# run_test NAME CMD [ARGS...]
+# Print NAME, run CMD, and exit the whole script with status 1 if it fails.
+run_test() {
+    local name="$1"
+    shift
+
+    echo "${name}..."
+    if "$@"; then
+        echo "✓ 通过"
+        echo
+    else
+        echo "✗ 失败"
+        exit 1
+    fi
+}
+
+# 1. Verify nvidia-smi. The output is captured first so that the exit status
+# of nvidia-smi itself, not that of `head`, decides the result.
+run_test "检查 nvidia-smi" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvidia-smi) && printf "%s\n" "$out" | head -n 10'
+
+# 2. Verify nvcc (captured before trimming, as in step 1)
+run_test "检查 nvcc --version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(nvcc --version) && printf "%s\n" "$out" | head -n 5'
+
+# 3. Verify CUDA / PyTorch
+run_test "检查 PyTorch CUDA" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import torch
+assert torch.cuda.is_available(), 'CUDA not available'
+print('torch:', torch.__version__)
+print('cuda:', torch.version.cuda)
+print('gpu:', torch.cuda.get_device_name(0))
+"
+
+# 4. Verify LLaMA-Factory import
+run_test "检查 LLaMA-Factory import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import llamafactory
+print('llamafactory:', getattr(llamafactory, '__version__', 'unknown'))
+"
+
+# 5. Verify llamafactory-cli
+run_test "检查 llamafactory-cli version" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    llamafactory-cli version
+
+# 6. Verify that the Web UI starts (waits up to 40 s for the Gradio banner)
+run_test "检查 Web UI 启动" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc '
+set -e
+
+LOG=/tmp/llamafactory-webui.log
+
+llamafactory-cli webui > "$LOG" 2>&1 &
+PID=$!
+
+cleanup() {
+    kill "$PID" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+for i in $(seq 1 40); do
+    if grep -q "Running on local URL" "$LOG"; then
+        cat "$LOG"
+        exit 0
+    fi
+
+    if ! kill -0 "$PID" >/dev/null 2>&1; then
+        cat "$LOG"
+        exit 1
+    fi
+
+    sleep 1
+done
+
+cat "$LOG"
+exit 1
+'
+
+# 7. Verify the API command (captured before trimming, as in step 1)
+run_test "检查 API 命令" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    bash -lc 'out=$(llamafactory-cli api --help) && printf "%s\n" "$out" | head -n 5'
+
+# 8. Verify vLLM import
+run_test "检查 vLLM import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import vllm
+print('vllm:', vllm.__version__)
+"
+
+# 9. Verify the core inference dependency transformers
+run_test "检查 Transformers import" \
+    docker run --rm --gpus all -e MODE=bash "$IMAGE" \
+    python3 -c "
+import transformers
+print('transformers:', transformers.__version__)
+"
+
+echo "=== 所有测试通过 ==="
\ No newline at end of file
diff --git a/frameworks/LLaMA-Factory/0.9.4/test_result.png b/frameworks/LLaMA-Factory/0.9.4/test_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..948788ba9020dbf99d86cf1505c878a5a5703ad2
Binary files /dev/null and b/frameworks/LLaMA-Factory/0.9.4/test_result.png differ