#!/usr/bin/env bash
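
# Korean LLM data pipeline: downloads raw corpora (CC-100, mC4, Namuwiki),
# trains a SentencePiece tokenizer, tokenizes each corpus into .bin files,
# and merges them into single train/val sets.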

set -euo pipefail
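
# Resolve the project root from this script's location so the pipeline can
# be launched from any working directory.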
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"
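
# Pipeline configuration: raw shard directory, .bin output directory,
# tokenizer artifacts, and vocabulary size.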
RAW_DIR="data/raw"
BIN_DIR="data"
TOKENIZER_DIR="tokenizer/korean_sp"
VOCAB_SIZE=64000
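
# Row caps bound download size and runtime; raise them for a full-scale run.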
CC100_MAX_ROWS=10000000
C4_MAX_ROWS=5000000

echo "=== Korean LLM data pipeline ==="
echo "Working directory: $PROJECT_ROOT"
echo ""
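
# download.py is assumed to stream the HuggingFace dataset and write
# plain-text shards of --shard_size rows each into --output_dir.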
echo "[1/7] Downloading CC-100 Korean..."
mkdir -p "$RAW_DIR/cc100_ko"
python data/download.py \
    --dataset cc100 \
    --subset ko \
    --text_col text \
    --output_dir "$RAW_DIR/cc100_ko" \
    --shard_size 100000 \
    --max_rows "$CC100_MAX_ROWS"
echo ""

echo "[2/7] Downloading mC4 Korean..."
mkdir -p "$RAW_DIR/c4_ko"
python data/download.py \
    --dataset allenai/c4 \
    --subset ko \
    --split train \
    --text_col text \
    --output_dir "$RAW_DIR/c4_ko" \
    --shard_size 100000 \
    --max_rows "$C4_MAX_ROWS"
echo ""

echo "[3/7] Downloading Namuwiki..."
mkdir -p "$RAW_DIR/namuwiki_ko"
python data/download.py \
    --dataset heegyu/namuwiki-extracted \
    --text_col text \
    --output_dir "$RAW_DIR/namuwiki_ko" \
    --shard_size 100000
echo ""
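
# SentencePiece expects a comma-separated list of input files; sample up to
# 20 shards from each corpus directory so tokenizer training stays tractable.
# The trailing "|| true" keeps pipefail from aborting the script if find
# receives SIGPIPE when head exits early.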
echo "[4/7] Training SentencePiece Unigram tokenizer (vocab=$VOCAB_SIZE)..."
mkdir -p "$TOKENIZER_DIR"

INPUT_FOR_SP=""
for dir in "$RAW_DIR/namuwiki_ko" "$RAW_DIR/cc100_ko" "$RAW_DIR/c4_ko"; do
    txts=$(find "$dir" -maxdepth 1 -name "*.txt" 2>/dev/null | head -20 | tr '\n' ',' || true)
    INPUT_FOR_SP="${INPUT_FOR_SP}${txts}"
done
INPUT_FOR_SP="${INPUT_FOR_SP%,}"
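
# Fail fast if no shards were found rather than letting SentencePiece
# error out on an empty input list.
if [ -z "$INPUT_FOR_SP" ]; then
    echo "ERROR: no .txt shards found for tokenizer training" >&2
    exit 1
fi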

python tokenizer/train_sp_tokenizer.py \
    --input "$INPUT_FOR_SP" \
    --vocab_size "$VOCAB_SIZE" \
    --output_dir "$TOKENIZER_DIR"
echo ""

echo "[5/7] Converting SentencePiece model to HuggingFace tokenizer.json..."
python tokenizer/convert_sp_to_hf.py \
    --model "$TOKENIZER_DIR/tokenizer.model" \
    --output "$TOKENIZER_DIR/tokenizer.json"
echo ""
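
# prepare.py is assumed to tokenize the input shards into a flat uint16
# token stream and, given --val_split, to write a companion *_val.bin next
# to each *_train.bin; the merge step below relies on that naming. Distinct
# --seed values keep each corpus's held-out split independent.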
echo "[6/7] Tokenizing data..."

python data/prepare.py \
    --input "$RAW_DIR/cc100_ko/*.txt" \
    --output "$BIN_DIR/korean_cc100_train.bin" \
    --tokenizer "$TOKENIZER_DIR/tokenizer.json" \
    --val_split 0.002 \
    --seed 42

python data/prepare.py \
    --input "$RAW_DIR/c4_ko/*.txt" \
    --output "$BIN_DIR/korean_c4_train.bin" \
    --tokenizer "$TOKENIZER_DIR/tokenizer.json" \
    --val_split 0.002 \
    --seed 43

python data/prepare.py \
    --input "$RAW_DIR/namuwiki_ko/*.txt" \
    --output "$BIN_DIR/korean_namuwiki_train.bin" \
    --tokenizer "$TOKENIZER_DIR/tokenizer.json" \
    --val_split 0.002 \
    --seed 44

echo ""

echo "[7/7] Merging training data..."
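
# merge_bins.py is assumed to take N input bins followed by the output
# path. Arrays with nullglob avoid parsing ls output and expand to nothing
# (rather than the literal pattern) when no shards exist.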
shopt -s nullglob
TRAIN_BINS=("$BIN_DIR"/korean_*_train.bin)
if [ ${#TRAIN_BINS[@]} -gt 0 ]; then
    python data/merge_bins.py "${TRAIN_BINS[@]}" "$BIN_DIR/korean_train.bin"
fi

VAL_BINS=("$BIN_DIR"/korean_*_val.bin)
if [ ${#VAL_BINS[@]} -gt 0 ]; then
    python data/merge_bins.py "${VAL_BINS[@]}" "$BIN_DIR/korean_val.bin"
fi

echo ""
echo "=== Done ==="
echo "Training data: $BIN_DIR/korean_train.bin"
echo "Validation data: $BIN_DIR/korean_val.bin"
echo "Tokenizer: $TOKENIZER_DIR/tokenizer.json"
echo ""
echo "Next step:"
echo "  python3 -c \""
echo "  import numpy as np"
echo "  d = np.memmap('$BIN_DIR/korean_train.bin', dtype='uint16', mode='r')"
echo "  print(f'Total tokens: {len(d):,}')"
echo "  \""
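
# Note: the uint16 dtype in the check above is valid because
# VOCAB_SIZE=64000 is below the 65,536 uint16 ceiling.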