#!/usr/bin/env bash
# data/build_korean_dataset.sh
# Full pipeline automation for Korean LLM training data.
#
# Usage:
#   bash data/build_korean_dataset.sh
#
# Steps:
#   1. Download CC-100 Korean
#   2. Download mC4 Korean
#   3. Download Namuwiki
#   4. Train the SentencePiece tokenizer (tokenizer/train_sp_tokenizer.py)
#   5. Convert SP → HuggingFace tokenizers.json
#   6. Tokenize each source (prepare.py)
#   7. Merge the .bin files (merge_bins.py)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"
# ─── Settings ───────────────────────────────────────────────────────────────
RAW_DIR="data/raw"
BIN_DIR="data"
TOKENIZER_DIR="tokenizer/korean_sp"
VOCAB_SIZE=64000
# CC-100: 10M rows (~1.5B tokens, i.e. roughly 150 tokens/row); the full
# corpus is 80M+ rows, so we sample first.
CC100_MAX_ROWS=10000000
C4_MAX_ROWS=5000000
echo "=== ν•œκ΅­μ–΄ LLM 데이터 νŒŒμ΄ν”„λΌμΈ ==="
echo "μž‘μ—… 디렉토리: $PROJECT_ROOT"
echo ""
# ─── Step 1: Download CC-100 Korean ─────────────────────────────────────────
echo "[1/7] Downloading CC-100 Korean..."
mkdir -p "$RAW_DIR/cc100_ko"
python data/download.py \
--dataset cc100 \
--subset ko \
--text_col text \
--output_dir "$RAW_DIR/cc100_ko" \
--shard_size 100000 \
--max_rows $CC100_MAX_ROWS
echo ""
# ─── Step 2: Download mC4 Korean ────────────────────────────────────────────
echo "[2/7] Downloading mC4 Korean..."
mkdir -p "$RAW_DIR/c4_ko"
python data/download.py \
--dataset allenai/c4 \
--subset ko \
--split train \
--text_col text \
--output_dir "$RAW_DIR/c4_ko" \
--shard_size 100000 \
--max_rows $C4_MAX_ROWS
echo ""
# ─── Step 3: Download Namuwiki ──────────────────────────────────────────────
echo "[3/7] Downloading Namuwiki..."
mkdir -p "$RAW_DIR/namuwiki_ko"
python data/download.py \
--dataset heegyu/namuwiki-extracted \
--text_col text \
--output_dir "$RAW_DIR/namuwiki_ko" \
--shard_size 100000
echo ""
# ─── Step 4: Train the SentencePiece tokenizer ──────────────────────────────
echo "[4/7] Training the SentencePiece Unigram tokenizer (vocab=$VOCAB_SIZE)..."
mkdir -p "$TOKENIZER_DIR"
# Use Namuwiki (small, fast to train on) plus the existing ko_wiki text under
# data/raw as seed text for the tokenizer.
INPUT_FOR_SP=""
for dir in "$RAW_DIR/namuwiki_ko" "$RAW_DIR"; do
    # "|| true" keeps set -e/pipefail from aborting the script when a
    # directory has no .txt files or head closes the pipe early (SIGPIPE).
    txts=$(find "$dir" -maxdepth 1 -name "*.txt" 2>/dev/null | head -20 | tr '\n' ',' || true)
    INPUT_FOR_SP="${INPUT_FOR_SP}${txts}"
done
INPUT_FOR_SP="${INPUT_FOR_SP%,}"  # strip the trailing comma
python tokenizer/train_sp_tokenizer.py \
--input "$INPUT_FOR_SP" \
--vocab_size $VOCAB_SIZE \
--output_dir "$TOKENIZER_DIR"
echo ""
# ─── Step 5: Convert SP → HF tokenizers.json ────────────────────────────────
echo "[5/7] Converting SentencePiece → HuggingFace tokenizers.json..."
python tokenizer/convert_sp_to_hf.py \
--model "$TOKENIZER_DIR/tokenizer.model" \
--output "$TOKENIZER_DIR/tokenizer.json"
echo ""
# ─── Step 6: Tokenize ───────────────────────────────────────────────────────
echo "[6/7] Tokenizing data..."
python data/prepare.py \
--input "$RAW_DIR/cc100_ko/*.txt" \
--output "$BIN_DIR/korean_cc100_train.bin" \
--tokenizer "$TOKENIZER_DIR/tokenizer.json" \
--val_split 0.002 \
--seed 42
python data/prepare.py \
--input "$RAW_DIR/c4_ko/*.txt" \
--output "$BIN_DIR/korean_c4_train.bin" \
--tokenizer "$TOKENIZER_DIR/tokenizer.json" \
--val_split 0.002 \
--seed 43
python data/prepare.py \
--input "$RAW_DIR/namuwiki_ko/*.txt" \
--output "$BIN_DIR/korean_namuwiki_train.bin" \
--tokenizer "$TOKENIZER_DIR/tokenizer.json" \
--val_split 0.002 \
--seed 44
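# Report (an addition): list the per-source .bin files so the relative size of
# each corpus is visible in the log before merging.
ls -lh "$BIN_DIR"/korean_*_train.bin "$BIN_DIR"/korean_*_val.bin 2>/dev/null || true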
echo ""
# ─── Step 7: Merge the .bin files ───────────────────────────────────────────
echo "[7/7] Merging training data..."
# Merge the training splits. "|| true" keeps set -e/pipefail from aborting
# the script when no files match the glob.
TRAIN_BINS=$(ls "$BIN_DIR"/korean_*_train.bin 2>/dev/null | tr '\n' ' ' || true)
if [ -n "$TRAIN_BINS" ]; then
python data/merge_bins.py $TRAIN_BINS "$BIN_DIR/korean_train.bin"
fi
# Merge the validation splits.
VAL_BINS=$(ls "$BIN_DIR"/korean_*_val.bin 2>/dev/null | tr '\n' ' ' || true)
if [ -n "$VAL_BINS" ]; then
python data/merge_bins.py $VAL_BINS "$BIN_DIR/korean_val.bin"
fi
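# Verification (an addition): confirm the merged files exist and print their
# sizes. With uint16 tokens (the dtype used in the snippet printed below),
# token count = file size / 2 bytes.
for f in "$BIN_DIR/korean_train.bin" "$BIN_DIR/korean_val.bin"; do
    if [ -f "$f" ]; then
        echo "  $f: $(du -h "$f" | cut -f1)"
    else
        echo "  WARNING: $f was not produced" >&2
    fi
done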
echo ""
echo "=== μ™„λ£Œ ==="
echo "ν•™μŠ΅ 데이터: $BIN_DIR/korean_train.bin"
echo "검증 데이터: $BIN_DIR/korean_val.bin"
echo "ν† ν¬λ‚˜μ΄μ €: $TOKENIZER_DIR/tokenizer.json"
echo ""
echo "λ‹€μŒ 단계:"
echo " python3 -c \""
echo " import numpy as np"
echo " d = np.memmap('$BIN_DIR/korean_train.bin', dtype='uint16', mode='r')"
echo " print(f'총 토큰: {len(d):,}')"
echo " \""