snejs 님의 블로그

ASM 코드 분할 및 병합 코드

snejs — Thu, 16 Oct 2025 11:44:52 +0900

ASM 코드 분할 Pass

// CallSplitterPass.cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/IR/IntrinsicInst.h"

#include <filesystem>
#include <fstream>
#include <iomanip>
#include <ostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

using namespace llvm;
namespace fs = std::filesystem;

// Command-line option -split-dir: base directory under which the chunk files
// are placed (defaults to "splitted").
static cl::opt<std::string> OutputDir("split-dir",
    cl::desc("Directory to place chunk files"),
    cl::init("splitted"));

// Command-line option -split-json: name of the JSONL metadata file
// (defaults to "chunks.jsonl").
static cl::opt<std::string> JsonOut("split-json",
    cl::desc("JSONL output file"),
    cl::init("chunks.jsonl"));

namespace {

// --- 유틸들 -----------------------------------------------------

/// Derives the name of the per-module output directory.
///
/// The name is the final path component of the module's source file name. If
/// that is empty, the stem of the module identifier is used instead, and as a
/// last resort the literal "module".
///
/// Only the last component is kept because the result is appended to the
/// output directory with fs::path::operator/. A name containing separators
/// would create nested directories, and an absolute source path would replace
/// the output directory altogether.
///
/// \param M Module whose name is queried.
/// \returns A non-empty, single-component directory name.
static std::string getSafeModuleName(Module &M) {
    std::string moduleName = sys::path::filename(M.getSourceFileName()).str();
    if (moduleName.empty()) {
        StringRef mid = M.getModuleIdentifier();
        moduleName = sys::path::stem(mid).str();
    }
    // "." and ".." would point at the output directory itself or its parent.
    if (moduleName.empty() || moduleName == "." || moduleName == "..")
        moduleName = "module";
    return moduleName;
}

/// Creates the base output directory and the per-module directory inside it.
///
/// Failures are reported as warnings on errs() and do not abort: the path of
/// the module directory is returned either way.
///
/// \param baseDirStr Base output directory.
/// \param moduleName Name of the sub-directory to create below the base.
/// \returns baseDirStr / moduleName.
static fs::path prepareModuleDir(const std::string &baseDirStr, const std::string &moduleName) {
    const fs::path root = baseDirStr;
    fs::path target = root / moduleName;

    std::error_code status;
    if (fs::create_directories(root, status); status) {
        errs() << "Warning: cannot create dir '" << root.string() << "': " << status.message() << "\n";
    }
    if (fs::create_directories(target, status); status) {
        errs() << "Warning: cannot create module dir '" << target.string() << "': " << status.message() << "\n";
    }

    return target;
}

static std::string instToString(const Instruction &I) {
    std::string s;
    raw_string_ostream rso(s);
    I.print(rso);
    rso.flush();
    return s;
}

/// Writes the lines of one chunk to moduleDir / chunkFilename.
///
/// \param moduleDir     Directory that receives the file.
/// \param chunkFilename File name of the chunk.
/// \param lines         Text lines; each is written followed by a newline.
/// \returns false (after printing a warning) when the file cannot be opened,
///          true otherwise.
static bool writeChunkFile(const fs::path &moduleDir, const std::string &chunkFilename,
                           const std::vector<std::string> &lines) {
    const std::string target = (moduleDir / chunkFilename).string();

    std::error_code openError;
    raw_fd_ostream out(target, openError, sys::fs::OF_Text);
    if (openError) {
        errs() << "Warning: cannot write chunk file " << target << ": " << openError.message() << "\n";
        return false;
    }

    for (const std::string &text : lines) {
        out << text << "\n";
    }
    return true;
}

/// Appends one JSONL record of the form {"id":"<chunkName>","labels":[...]}.
///
/// Labels are emitted in the set's iteration order, comma separated, and the
/// record ends with a newline. Values are written verbatim without JSON
/// escaping, so they must not contain quotes or backslashes.
///
/// \param jsonl     Output stream; any std::ostream (including std::ofstream).
/// \param chunkName Value of the "id" field.
/// \param labels    Labels attached to the chunk.
static void writeChunkJson(std::ostream &jsonl, const std::string &chunkName,
                           const std::set<std::string> &labels) {
    jsonl << "{\"id\":\"" << chunkName << "\",\"labels\":[";
    // A separator that is empty before the first element avoids an index
    // counter and the signed/unsigned comparison that came with it.
    const char *separator = "";
    for (const std::string &label : labels) {
        jsonl << separator << "\"" << label << "\"";
        separator = ",";
    }
    jsonl << "]}\n";
}

/// Persists the pending chunk and resets the accumulators.
///
/// The chunk is named "chunk_" followed by chunk_id zero-padded to at least
/// four digits. Its lines go to "<name>.s" in moduleDir and its labels to the
/// JSONL stream; afterwards both containers are cleared and chunk_id is
/// incremented.
///
/// \param jsonl             JSONL metadata stream.
/// \param moduleDir         Directory that receives the chunk file.
/// \param chunk_id          Running chunk counter (incremented here).
/// \param currentChunkLines Lines of the pending chunk (cleared here).
/// \param currentLabels     Labels of the pending chunk (cleared here).
static void flushChunk(std::ofstream &jsonl, const fs::path &moduleDir,
                       unsigned &chunk_id,
                       std::vector<std::string> &currentChunkLines,
                       std::set<std::string> &currentLabels) {
    // Left-pad the decimal id with zeros up to a width of four.
    std::string digits = std::to_string(chunk_id);
    if (digits.size() < 4)
        digits = std::string(4 - digits.size(), '0') + digits;
    const std::string chunkName = "chunk_" + digits;

    writeChunkFile(moduleDir, chunkName + ".s", currentChunkLines);
    writeChunkJson(jsonl, chunkName, currentLabels);

    currentChunkLines.clear();
    currentLabels.clear();
    ++chunk_id;
}

// --- 함수별 처리 ----------------
/// Splits one function into labelled chunks and writes them out.
///
/// Instructions are visited in block order and appended to the current chunk.
/// A chunk is flushed after every call-like instruction (CallBase) and after
/// every branch, switch, indirect branch or return; whatever remains at the
/// end of the function is flushed as a final chunk.
///
/// Labels added to the chunk that contains the instruction:
///  - "FUNC_ENTRY"       for the first instruction of the entry block,
///  - "DISPATCH_HANDLER" while inside a dispatch region, else "NO_DISPATCH",
///  - "DISPATCH_ENTRY"   for a direct call to dummy_function_succ (opens a region),
///  - "DISPATCH_EXIT"    for a direct call to dummy_function_pred (closes a region).
/// The region state changes only after the marker call itself has been
/// labelled, so the marker's own chunk still carries the previous state.
///
/// \param F         Function to split.
/// \param jsonl     JSONL metadata stream.
/// \param moduleDir Directory that receives the chunk files.
/// \param chunk_id  Running chunk counter shared across functions.
static void processFunction(Function &F, std::ofstream &jsonl, const fs::path &moduleDir, unsigned &chunk_id) {
    std::vector<std::string> currentChunkLines;
    std::set<std::string> currentLabels;
    unsigned dispatchDepth = 0;

    // Resolve the first instruction once; a body-less function has none.
    const Instruction *firstInst = nullptr;
    if (!F.empty() && !F.getEntryBlock().empty())
        firstInst = &F.getEntryBlock().front();

    for (BasicBlock &BB : F) {
        for (Instruction &I : BB) {
            if (&I == firstInst)
                currentLabels.insert("FUNC_ENTRY");

            currentLabels.insert(dispatchDepth > 0 ? "DISPATCH_HANDLER" : "NO_DISPATCH");

            // Record the textual form of the instruction.
            currentChunkLines.push_back(instToString(I));

            if (auto *CB = dyn_cast<CallBase>(&I)) {
                // Only direct calls expose a callee; indirect calls just split.
                if (const Function *calledFunc = CB->getCalledFunction()) {
                    const StringRef calleeName = calledFunc->getName();
                    if (calleeName == "dummy_function_succ") {
                        currentLabels.insert("DISPATCH_ENTRY");
                        ++dispatchDepth;
                    }
                    else if (calleeName == "dummy_function_pred") {
                        currentLabels.insert("DISPATCH_EXIT");
                        // An exit marker without a matching entry must not
                        // wrap the unsigned counter around; that would label
                        // the rest of the function as DISPATCH_HANDLER.
                        if (dispatchDepth > 0)
                            --dispatchDepth;
                    }
                }

                flushChunk(jsonl, moduleDir, chunk_id, currentChunkLines, currentLabels);
            }
            else if (isa<BranchInst>(I) || isa<SwitchInst>(I) ||
                     isa<IndirectBrInst>(I) || isa<ReturnInst>(I)) {
                // Control-flow instructions end a chunk without an extra label
                // (per-kind BRANCH/SWITCH/INDIRECT_BRANCH/RETURN labels are
                // intentionally disabled).
                flushChunk(jsonl, moduleDir, chunk_id, currentChunkLines, currentLabels);
            }
        }
    }

    // Flush whatever is left at the end of the function.
    if (!currentChunkLines.empty()) {
        flushChunk(jsonl, moduleDir, chunk_id, currentChunkLines, currentLabels);
    }
}

// --- Pass 구현 ------------------------------------------------
/// Module pass that writes every defined function as labelled chunk files
/// plus a JSONL metadata file. The IR is not modified.
struct SplitterPass : public PassInfoMixin<SplitterPass> {
    /// Runs the splitter over \p M.
    ///
    /// Chunk ids start at 1 and keep counting across functions. If the JSONL
    /// file cannot be opened, an error is printed and nothing is written.
    ///
    /// \returns PreservedAnalyses::all() in every case.
    PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
        const fs::path moduleDir = prepareModuleDir(OutputDir.getValue(), getSafeModuleName(M));

        std::ofstream jsonl(moduleDir / JsonOut.getValue(), std::ios::out | std::ios::trunc);
        if (!jsonl.is_open()) {
            errs() << "Error: cannot open JSONL file: " << JsonOut << "\n";
            return PreservedAnalyses::all();
        }

        unsigned chunk_id = 1;
        for (Function &F : M) {
            // Declarations have no body to split.
            if (!F.isDeclaration())
                processFunction(F, jsonl, moduleDir, chunk_id);
        }
        jsonl.close();

        // chunk_id is one past the last id that was used.
        const unsigned written = chunk_id - 1;
        errs() << "[SplitterPass] wrote " << written << " chunks to directory '" << moduleDir.string()
               << "' and JSONL '" << JsonOut.getValue() << "'\n";

        return PreservedAnalyses::all();
    }
};

} // namespace

// plugin registration
/// Plugin entry point: registers the pipeline name "splitter" so that
/// -passes="splitter" adds a SplitterPass to the module pass manager.
extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() {
    return {
        LLVM_PLUGIN_API_VERSION, "SplitterPass", LLVM_VERSION_STRING,
        [](PassBuilder &PB) {
            // Claims the element only when its name matches this pass.
            auto parseElement = [](StringRef Name, ModulePassManager &MPM,
                                   ArrayRef<PassBuilder::PipelineElement>) -> bool {
                if (Name != "splitter")
                    return false;
                MPM.addPass(SplitterPass());
                return true;
            };
            PB.registerPipelineParsingCallback(parseElement);
        }
    };
}

주요 코드 :

호출(Call) 또는 분기(Branch, Switch, IndirectBranch, Return) 명령어를 만날 때마다 청크를 끊고 저장
함수 시작 시 "FUNC_ENTRY" 라벨 부여
"dummy_function_succ" 호출 시 "DISPATCH_ENTRY" 부여 및 디스패치 구간(DISPATCH_HANDLER) 진입
"dummy_function_pred" 호출 시 "DISPATCH_EXIT" 부여 및 디스패치 구간(DISPATCH_HANDLER) 종료
디스패치 구간 라벨은 "DISPATCH_HANDLER", 구간 외 라벨은 "NO_DISPATCH"로 부여

Chunk 합병 코드 1 (mergeChunk1.py)

import os
import json

# ===== Settings =====
INPUT_DIR = "./splitted"   # root folder that holds one sub-folder per split module
OUTPUT_DIR = "./merged1"   # root folder that receives the merged chunks
h = 20                     # size limit of one merged chunk, in KB
MAX_SIZE = h * 1024  # 20KB

# ===== Folder selection =====
# Sorted so the numbers shown in the menu are stable between runs (os.listdir
# returns entries in arbitrary order). A missing INPUT_DIR is treated like an
# empty one instead of raising.
subdirs = sorted(
    d
    for d in (os.listdir(INPUT_DIR) if os.path.isdir(INPUT_DIR) else [])
    if os.path.isdir(os.path.join(INPUT_DIR, d))
)

if not subdirs:
    print("❌ 'splitted' 폴더 안에 하위 폴더가 없습니다.")
    # SystemExit works even when the interactive `exit` helper is unavailable.
    raise SystemExit(1)

print("=== 병합 대상 코드 목록 ===")
for i, d in enumerate(subdirs, 1):
    print(f"{i}. {d}")

target = input("\n병합할 TARGET_DIR 이름 또는 번호 입력 (Enter 입력 시 전체 처리): ").strip()

# Empty input selects every folder; a number selects by menu position;
# anything else must be an existing folder name.
if target == "":
    targets = subdirs
    print("\n[!] 전체 폴더 자동 처리 모드 실행 중...\n")
elif target.isdigit():
    idx = int(target) - 1
    if 0 <= idx < len(subdirs):
        targets = [subdirs[idx]]
    else:
        print(f"❌ 번호 '{target}'는 유효하지 않습니다.")
        raise SystemExit(1)
else:
    if target not in subdirs:
        print(f"❌ '{target}' 폴더를 찾을 수 없습니다. 경로를 확인하세요.")
        raise SystemExit(1)
    targets = [target]


# ===== 병합 함수 정의 =====
def merge_target_code(TARGET_DIR, input_dir=None, output_dir=None, max_size=None):
    """Merge the chunk files of one target folder into size-bounded groups.

    Reads ``chunk_<N>.s`` files (in numeric order) together with the
    ``chunks.jsonl`` metadata from ``<input_dir>/<TARGET_DIR>`` and writes the
    merged ``chunk_NNNN.s`` files plus a new ``chunks.jsonl`` to
    ``<output_dir>/<TARGET_DIR>``.

    Merge rules:
      * A chunk labelled ``DISPATCH_ENTRY`` or ``DISPATCH_EXIT`` is never
        merged: the pending group is flushed and the chunk is stored alone.
      * Other chunks are joined with a newline while the joined text stays
        within ``max_size`` bytes. A single chunk larger than ``max_size`` is
        still stored, on its own.

    Args:
        TARGET_DIR: Name of the sub-folder to process.
        input_dir: Root of the split output; defaults to ``INPUT_DIR``.
        output_dir: Root of the merged output; defaults to ``OUTPUT_DIR``.
        max_size: Size limit of a merged chunk in bytes; defaults to ``MAX_SIZE``.

    Raises:
        ValueError: If the number of chunk files differs from the number of
            JSONL entries.
    """
    # Defaults are resolved at call time so the module-level settings remain
    # the single source of truth for existing callers.
    input_root = INPUT_DIR if input_dir is None else input_dir
    output_root = OUTPUT_DIR if output_dir is None else output_dir
    size_limit = MAX_SIZE if max_size is None else max_size

    base_dir = os.path.join(input_root, TARGET_DIR)
    merged_dir = os.path.join(output_root, TARGET_DIR)
    os.makedirs(merged_dir, exist_ok=True)

    def chunk_number(filename):
        """Return N for 'chunk_N.s', or None for any other file name."""
        if not (filename.startswith("chunk_") and filename.endswith(".s")):
            return None
        digits = filename[len("chunk_"):-len(".s")]
        if digits.isascii() and digits.isdigit():
            return int(digits)
        return None

    # Unrelated .s files are skipped instead of crashing the sort key.
    chunks = sorted(
        (f for f in os.listdir(base_dir) if chunk_number(f) is not None),
        key=chunk_number,
    )

    jsonl_path = os.path.join(base_dir, "chunks.jsonl")
    if not os.path.exists(jsonl_path):
        print(f"⚠️ {TARGET_DIR}: chunks.jsonl 파일이 없습니다. 건너뜀.")
        return

    with open(jsonl_path, "r", encoding="utf-8") as f:
        jsonl_entries = [json.loads(line) for line in f if line.strip()]

    if len(jsonl_entries) != len(chunks):
        raise ValueError(f"{TARGET_DIR}: .s 파일 개수와 jsonl 엔트리 수가 다릅니다!")

    merged_id = 1
    merged_jsonl = []
    pending = []            # contents of the chunks in the current group
    pending_labels = set()  # union of their labels
    pending_size = 0        # byte size of "\n".join(pending)

    def write_group(text, labels):
        """Store one merged chunk plus its metadata and return its name."""
        nonlocal merged_id
        merged_name = f"chunk_{merged_id:04d}"
        with open(os.path.join(merged_dir, f"{merged_name}.s"), "w", encoding="utf-8") as out:
            out.write(text)
        merged_jsonl.append({
            "id": merged_name,
            "labels": sorted(labels)
        })
        merged_id += 1
        return merged_name

    def flush_merged():
        """Write the pending group, if any, and start a new one."""
        nonlocal pending_size
        if not pending:
            return
        merged_name = write_group("\n".join(pending), pending_labels)
        print(f"[+] {TARGET_DIR}: Saved {merged_name}.s ({pending_size/1024:.1f} KB, labels={sorted(pending_labels)})")
        pending.clear()
        pending_labels.clear()
        pending_size = 0

    # ===== Merge =====
    for entry, s_file in zip(jsonl_entries, chunks):
        with open(os.path.join(base_dir, s_file), "r", encoding="utf-8") as f:
            content = f.read()

        labels = entry.get("labels", [])
        size = len(content.encode())

        # Dispatch boundary chunks are always stored on their own.
        if "DISPATCH_ENTRY" in labels or "DISPATCH_EXIT" in labels:
            flush_merged()
            merged_name = write_group(content, labels)
            print(f"[+] {TARGET_DIR} → {merged_name}.s (Dispatch 단독 처리)")
            continue

        # Joining adds one newline between chunks; count it so the merged
        # file really stays within the limit.
        joiner = 1 if pending else 0
        if pending and pending_size + joiner + size > size_limit:
            flush_merged()
            joiner = 0

        pending.append(content)
        pending_labels.update(labels)
        pending_size += joiner + size

    # Flush the last group.
    flush_merged()

    merged_jsonl_path = os.path.join(merged_dir, "chunks.jsonl")
    with open(merged_jsonl_path, "w", encoding="utf-8") as f:
        for entry in merged_jsonl:
            f.write(json.dumps(entry) + "\n")

    print(f"✅ {TARGET_DIR}: 병합 완료 — 총 {merged_id - 1}개 묶음 생성됨\n")


# ===== Run the merge for every selected folder =====
for target_name in targets:
    merge_target_code(target_name)

print("  모든 병합 작업 완료!")

병합 규칙

이 코드는 splitted 폴더 내 각 타깃 폴더(TARGET_DIR)를 대상으로
.s(어셈블리 청크) 파일들을 chunks.jsonl 메타데이터 정보를 이용해 병합을 수행한다.
병합된 결과는 merged1/<TARGET_DIR> 폴더에 저장된다.

병합 로직

1. 용량 기준 (h KB)

h는 용량 기준을 나타내는 변수
MAX_SIZE = h * 1024 (h KB)
현재 병합 중인 청크의 크기(current_size)와 새로 추가할 청크의 크기의 합이 h KB를 초과하는지 검사
초과 시, 현재 병합 중이던 청크를 flush
문제점 : 입력 청크 하나가 애초에 h KB를 넘는 크기일 경우에는 기준을 충족하지 못함

if current_size + size > MAX_SIZE: flush_merged()

2. DISPATCH 관련 라벨 단독 처리

현재 labels에 "DISPATCH_ENTRY" 또는 "DISPATCH_EXIT"이 포함되어 있을 시,
병합하지 않고, 현재까지 모은 chunk들을 flush한 뒤 해당 chunk를 단독으로 저장

Virtualize asm 코드 분할 및 라벨링 자동화

snejs — Wed, 24 Sep 2025 20:24:48 +0900

코드 생성에는 GPT를 활용하였다.

문제 상황

Virtualize 난독화가 적용된 asm 코드를 입력으로 하여 코드를 분할하고 라벨링을 자동화 할 수 있는 코드 생성

가정

입력 : Virtualize 난독화가 적용된 asm 코드
출력 : 분할된 asm 코드 집합 + labeling된 데이터
요구사항 : "call"을 기준으로 코드 분할

위 상황에 대해 구체적인 문제에 대입 가능하도록 하는 코드 초안을 생성

사용된 원본 .c 코드는 약 4KB 코드를 사용하였으며, Virtualize 난독화를 적용한 후에는 약 695KB의 코드가 되었다.

Python Code 생성

직접 pass를 사용하고 생성하는 것을 바로 하기에는 힘들 것으로 보여 python code를 1차적으로 생성하고,

요구사항에 맞춰서 수정하였다.

python code 에 사용된 입력은 clang -S 으로 컴파일된 .s 파일을 사용하였다.

정규식을 이용하여 단순 "call" 찾아 코드를 분할 한 뒤, .jsonl에 id와 labeling 등의 정보를 저장할 수 있게 했다.

import re
import json
import os

def split_asm_file(input_file, output_file):
    """Split an assembly file into chunks that each end at a call instruction.

    Blank lines are dropped and every other line is stripped. A chunk is closed
    on each line containing a ``call`` mnemonic (including the AT&T suffixed
    forms ``calll``/``callq``/``callw``); lines after the last call form a
    final chunk. Chunk files are written to a folder named after the input
    file (without extension) next to ``output_file``, and one JSON object per
    chunk (``id``, ``labels``, ``file``) is written to ``output_file``.

    Labels are collected per chunk:
      * ``FUNC_ENTRY`` if the chunk contains a ``name:`` label line, optionally
        followed by a ``#`` comment;
      * ``CALL_SITE`` if the chunk ends with a call.

    Args:
        input_file: Path of the assembly file to split.
        output_file: Path of the JSONL metadata file to create.
    """
    # Prepare the output folder and the base name.
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    folder = os.path.join(os.path.dirname(output_file) or ".", base_name)
    os.makedirs(folder, exist_ok=True)

    # Function label such as "main:" or "main:    # @main".
    func_label_re = re.compile(r"^[a-zA-Z0-9_]+:\s*(?:#.*)?$")
    call_re = re.compile(r"\bcall[lqw]?\b")

    chunks_meta = []
    current_chunk = []
    current_labels = []  # labels of the chunk being built, not of a single line

    def save_chunk():
        """Write the current chunk and its metadata, then start a new chunk."""
        chunk_name = f"{base_name}_chunk_{len(chunks_meta)}"
        chunk_path = os.path.join(folder, chunk_name + ".s")

        # One line per instruction, newline-terminated.
        with open(chunk_path, "w", encoding="utf-8") as cf:
            cf.write("\n".join(current_chunk) + "\n")

        # Only metadata goes into the JSONL (no asm field).
        chunks_meta.append({
            "id": chunk_name,
            "labels": list(current_labels),
            "file": os.path.relpath(chunk_path)  # stored as a relative path
        })
        current_chunk.clear()
        current_labels.clear()

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            line_strip = line.strip()
            if not line_strip:
                continue  # skip blank lines

            current_chunk.append(line_strip)

            # Labels accumulate until the chunk is saved; resetting them on
            # every line would drop FUNC_ENTRY before the chunk is written.
            if func_label_re.match(line_strip) and "FUNC_ENTRY" not in current_labels:
                current_labels.append("FUNC_ENTRY")

            if call_re.search(line_strip):
                current_labels.append("CALL_SITE")
                save_chunk()

    # Save the remainder after the last call, keeping its labels.
    if current_chunk:
        save_chunk()

    # Write the JSONL metadata.
    with open(output_file, "w", encoding="utf-8") as f:
        for meta in chunks_meta:
            f.write(json.dumps(meta, ensure_ascii=False) + "\n")

    print(f"[+] Saved {len(chunks_meta)} chunks to {output_file}")
    print(f"[+] Chunk files are in folder: {folder}")

if __name__ == "__main__":
    # Split the sample assembly file and store the metadata as JSONL.
    split_asm_file(input_file="test_out.s", output_file="test_out.jsonl")

결과 예시

생성된 파일은 약 1KB ~ 300KB 의 파일들이 41개 생성되었다.

C++ Code 생성

위에서 생성한 python 코드를 바탕으로 LLVM Pass 를 활용한 C++ 코드를 생성하도록 했다.

python 코드에서 불필요한 정보들을 제거하고, 분할된 코드 집합과 jsonl 파일이 한 폴더에 생성되도록 수정하였다.

// CallSplitterPass.cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Passes/PassBuilder.h" 
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

#include <filesystem>   
#include <vector>
#include <string>
#include <fstream>

using namespace llvm;
namespace fs = std::filesystem;

// Command-line option -split-asm-dir: base directory under which the chunk
// files are placed (defaults to "split_asm_out").
static cl::opt<std::string> OutputDir("split-asm-dir",
    cl::desc("Directory to place chunk files"),
    cl::init("split_asm_out"));

// Command-line option -split-asm-json: name of the JSONL metadata file
// (defaults to "chunks.jsonl").
static cl::opt<std::string> JsonOut("split-asm-json",
    cl::desc("JSONL output file"),
    cl::init("chunks.jsonl"));

namespace {

/// Module pass that writes every defined function as a series of text chunks,
/// closing a chunk after each call-like instruction (CallBase), and records
/// the labels of each chunk in a JSONL file. The IR is not modified.
struct CallSplitterPass : public PassInfoMixin<CallSplitterPass> {
    /// Runs the splitter over \p M.
    ///
    /// Output goes to <split-asm-dir>/<module name>/: one "chunk_<id>.s" per
    /// chunk plus the JSONL file. Chunk ids start at 0 and keep counting
    /// across functions. Labels: "FUNC_ENTRY" on the first chunk of each
    /// function, "CALL_SITE" on every chunk that ends with a call.
    ///
    /// \returns PreservedAnalyses::all() in every case.
    PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
        // 1) Module name: keep only the last path component, because the name
        //    is appended to the output directory below. A name with separators
        //    would nest directories and an absolute one would replace the
        //    output directory.
        std::string moduleName = sys::path::filename(M.getSourceFileName()).str();
        if (moduleName.empty()) {
            StringRef mid = M.getModuleIdentifier();
            moduleName = sys::path::stem(mid).str();
        }
        if (moduleName.empty() || moduleName == "." || moduleName == "..")
            moduleName = "module";

        // 2) Output directories (failures are reported but not fatal).
        fs::path baseDir = OutputDir.getValue();
        std::error_code ec_fs;

        fs::create_directories(baseDir, ec_fs);
        if (ec_fs) {
            errs() << "Warning: cannot create dir '" << baseDir.string() << "': " << ec_fs.message() << "\n";
        }

        fs::path moduleDir = baseDir / moduleName;

        fs::create_directories(moduleDir, ec_fs);
        if (ec_fs) {
            errs() << "Warning: cannot create module dir '" << moduleDir.string() << "': " << ec_fs.message() << "\n";
        }

        // 3) JSONL file (truncated on open).
        fs::path jsonPath = moduleDir / JsonOut.getValue();
        std::ofstream jsonl(jsonPath, std::ios::out | std::ios::trunc);
        if (!jsonl.is_open()) {
            errs() << "Error: cannot open JSONL file: " << JsonOut << "\n";
            return PreservedAnalyses::all();
        }

        unsigned chunk_id = 0;

        for (Function &F : M) {
            if (F.isDeclaration()) continue;

            std::vector<std::string> currentChunkLines;
            std::vector<std::string> currentLabels;
            // Stays true for the rest of the function so that only the first
            // chunk of the function is labelled FUNC_ENTRY.
            bool seenAnyInstInFunction = false;

            for (BasicBlock &BB : F) {
                for (Instruction &I : BB) {
                    if (!seenAnyInstInFunction) {
                        currentLabels.push_back("FUNC_ENTRY");
                        seenAnyInstInFunction = true;
                    }

                    // Instruction -> text.
                    std::string instStr;
                    raw_string_ostream rso(instStr);
                    I.print(rso);
                    rso.flush();
                    currentChunkLines.push_back(instStr);

                    // Every call-like instruction closes the chunk.
                    if (isa<CallBase>(&I)) {
                        currentLabels.push_back("CALL_SITE");
                        emitChunk(moduleDir, jsonl, chunk_id, currentChunkLines, currentLabels);
                    }
                }
            }

            // Store what is left at the end of the function.
            if (!currentChunkLines.empty()) {
                emitChunk(moduleDir, jsonl, chunk_id, currentChunkLines, currentLabels);
            }
        }

        jsonl.close();
        errs() << "[CallSplitterPass] wrote " << chunk_id << " chunks to directory '" << moduleDir.string()
               << "' and JSONL '" << JsonOut << "'\n";

        return PreservedAnalyses::all();
    }

private:
    /// Writes one chunk file and its JSONL record, then clears both
    /// accumulators and advances the chunk counter.
    static void emitChunk(const fs::path &moduleDir, std::ofstream &jsonl, unsigned &chunk_id,
                          std::vector<std::string> &lines, std::vector<std::string> &labels) {
        const std::string chunkName = "chunk_" + std::to_string(chunk_id);
        const fs::path chunkPath = moduleDir / (chunkName + ".s");

        // Chunk text; a failure is only warned about and the JSONL record is
        // still written.
        std::error_code ec;
        raw_fd_ostream ofs(chunkPath.string(), ec, sys::fs::OF_Text);
        if (ec) {
            errs() << "Warning: cannot write chunk file " << chunkPath.string() << ": " << ec.message() << "\n";
        } else {
            for (const std::string &L : lines) ofs << L << "\n";
        }

        // JSONL record.
        jsonl << "{\"id\":\"" << chunkName << "\",";
        jsonl << "\"labels\":[";
        for (size_t li = 0; li < labels.size(); ++li) {
            jsonl << "\"" << labels[li] << "\"";
            if (li + 1 < labels.size()) jsonl << ",";
        }
        jsonl << "]";
        jsonl << "}\n";

        ++chunk_id;
        lines.clear();
        labels.clear();
    }
};

} // namespace

// plugin registration
/// Plugin entry point: registers the pipeline name "call-splitter" so that
/// -passes="call-splitter" adds a CallSplitterPass to the module pass manager.
extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() {
    return {
        LLVM_PLUGIN_API_VERSION, "CallSplitterPass", LLVM_VERSION_STRING,
        [](PassBuilder &PB) {
            // Claims the element only when its name matches this pass.
            auto parseElement = [](StringRef Name, ModulePassManager &MPM,
                                   ArrayRef<PassBuilder::PipelineElement>) -> bool {
                if (Name != "call-splitter")
                    return false;
                MPM.addPass(CallSplitterPass());
                return true;
            };
            PB.registerPipelineParsingCallback(parseElement);
        }
    };
}

사용한 컴파일 코드

clang++ -std=c++17 -fPIC -shared CallSplitterPass.cpp -o CallSplitterPass.so \
  `llvm-config --cxxflags`

clang -O1 -emit-llvm -c -Xclang -disable-O0-optnone -o test_out.bc test_out.c

opt -load-pass-plugin=./CallSplitterPass.so -passes="call-splitter" \
    -split-asm-dir=split_asm_out -split-asm-json=chunks.jsonl test_out.bc -o /dev/null

결과 예시

생성된 파일은 약 1KB ~ 40KB의 asm 파일들이 177개 생성되었다.

정리하며

정규식을 사용하여 단순 call을 기준으로 분할한 python 코드에 비해 llvm pass를 활용하여 분할한 코드가 훨씬 균일하게 분할되는 것을 확인할 수 있었다.

또한, llvm pass 를 활용하여 분할한 코드에서 tail call이 연속적으로 호출되어 1줄짜리 코드로 분할되어 다수 생성되는 것이 확인됐다.

전체적으로 단순한 동작을 구현하고 있으나, 후일 추가적인 요구사항에 따라 수정할 예정이다.

AI를 활용한 LLVM 적용 아이디어 생성

snejs — Tue, 19 Aug 2025 10:40:01 +0900

목표

LLVM에 대한 기초적인 PASS 생성 정도의 지식 수준에서 현재 진행 중인 프로젝트에 적용 방법을 고안하는 것에 한계를 느끼고, 구글링을 통해 정보를 얻어내는 방법도 있으나, AI를 활용함으로써 더 양질의 정보를 빠르게 얻어보고자 한다.

2025 AI 여름방학 특강에서 배운 프롬프트 작성 방법을 활용해 AI에게 아이디어를 생성시켜 프로젝트의 진행 방향을 모색할 수 있도록 한다.

대상

Chat GPT-5-thinking 과 Perplexity AI에서 쿼리에 따른 최적 모델을 사용하도록 설정하여 생성시켰다.

Chat GPT 는 가장 보편적인 생성형 AI이고, Perplexity AI는 정보 검색에 뛰어난 성능을 보인다는 것을 사유로 두 AI를 선정하였다.

프롬프트

다음 프롬프트는 특강 중 AI 활용 마스터 부분을 참고하여 작성한 프롬프트이다.

[명령] llvm을 '바이너리를 이용한 제어구조 판별 도구(llm) 개발' 에 적용할 수 있는 방법을 제시해줘.
[맥락] 이 프로젝트의 배경은 요즘 주로 악성코드의 난독화 과정에서 사용되는 virtualize라는 난독화가 이루어지는 코드에서 fetch-decode-execute 형식으로 많이 변형되곤 하는데, 이 형식이 loop-switch 구조와 닮은 것을 기반으로 이와 같은 난독화가 적용되었는지 여부를 탐지해내고자 하는 것이야. llm이 난독화된 코드의 바이너리 코드를 보고 난독화를 효율적으로 탐지해낼 수 있도록 정보를 간략화하거나 주요한 정보만을 바탕으로 llm이 판별할 수 있도록 하기 위해 llvm을 적용하고자 해.
[제한] 이는 프로젝트에 접목하고자 하는 아이디어를 제시하는 것으로 3문장을 넘어가지 않는 설명과 함께 아이디어를 제시해줘. 아이디어는 10가지 정도 제시해줘. 종류는 기존을 방법을 바탕으로 7가지 창의적인 방법으로 3가지 제시해줘
[예시] llvm은 http://www.llvm.org을 참고해서 어떤 방법을 적용할 수 있는지 탐색할 수 있을 것이고, 아이디어는 이전까지 다른 프로젝트에 llvm에 어떻게 적용되었는지 논문 탐색을 바탕으로 어떤 방식으로 활용되었는지 를 참고하면 좋을 거 같아.

Chat GPT-5-thinking의 응답

아래 10가지는 “바이너리를 이용한 제어구조 판별 도구(LLM)”에 LLVM을 접목해, 가상화(virtualize) 난독화가 만들어내는 fetch–decode–execute(= loop–switch 류) 패턴을 LLM이 쉽게 판별하도록 특징을 간추려 주는 방법입니다. 각 항목은 3문장 이내로 요약했습니다.

기존 방법 기반 (7)

MC/objdump 기반 CFG·점프테이블 복원
llvm-objdump와 LLVM MC 디스어셈블러를 써서 간접 분기·점프테이블·핸들러 블록을 복원하고, BB(기본블록) 단위 통계(간접분기 밀도, 분기 대상 다양도, 핸들러 길이 분산)를 LLM 입력용 특징으로 내보냅니다. 가상화 VM 인터프리터의 switch/간접분기 집중 패턴을 정량화해 태깅합니다. (llvm.org, The LLVM Project Blog)
Object 라이브러리로 섹션·심볼·재배치 메타데이터 추출
llvm::object로 바이너리 포맷별(ELF/Mach-O/COFF) 코드/데이터/읽기전용 섹션과 재배치·심볼 정보를 읽어, “비정형 바이트코드 테이블 + 작은 핸들러 군집” 같은 VM 흔적을 규칙化합니다. 이 메타를 LLM 프롬프트의 헤더로 붙여 잡음 없이 맥락을 제공합니다. (llvm.org)
BOLT로 핫 트레이스·CFG 가중치 산출
perf2bolt→llvm-bolt 파이프라인으로 실제 실행 프로파일을 반영한 CFG(간접분기 허브가 과열되는 인터프리터 루프)와 핸들러 진입/퇴출의 핫에지 비율을 뽑아 특징화합니다. 해석기형 루프의 “짧은 핸들러 다중호출 + 간접분기 허브” 지표를 만듭니다. (GitHub, llvm.org, research.facebook.com)
llvm-mca로 디스패치 마이크로패턴 분석
디스어셈블 스니펫을 llvm-mca에 넣어, 반복 루프 내 로드·분기·비산 수치(throughput/리소스 사용) 패턴을 얻고 해석기 루프 특유의 ‘얇은 연산+간접분기’ 지표를 생성합니다. 성능 시뮬레이션 수치 자체를 LLM이 읽기 쉬운 스칼라 특징으로 축약합니다. (llvm.org, en.algorithmica.org, Johnny's Software Lab)
XRay(정적/동적)로 간접 분기 집약 트레이스 채집
가능하면 XRay로 함수 경계/콜스택 트레이스를 수집하고, 짧은 처리기(handler) 호출이 고빈도로 반복되는 패턴을 정리합니다. 정적 특징과 결합해 “동적 호출-길이 분포”를 LLM에 제공합니다. (llvm.org)
MIR/MachineFunctionPass 기반 디코딩 후 규격화
MC로 올린 기계명령을 MIR 수준의 그래프로 정리하고, MachineFunctionPass로 “인터프리터 루프 후보”를 표준 형태(머리 루프+간접 분기+핸들러 군집)로 정규화해 델타를 줄입니다. LLM에는 원본 대신 ‘정규화된 CFG 서명 + 통계’만 전달합니다. (llvm.org)
MLIR 데이터플로 분석으로 바이트코드 상태추적
바이트코드 포인터/스택 레지스터처럼 보이는 상태변수를 MLIR 데이터플로 프레임워크로 추적해, “fetch(메모리 인덱싱)→decode(테이블 참조)→execute(핸들러 분기)”의 흐름을 태그합니다. LLM은 이 태그열(상태머신 로그)을 근거로 판별만 하면 됩니다. (mlir.llvm.org)

창의적 방법 (3)

“VM 지문(fingerprint)” 벡터화 & LLM 라벨링
Tigress 등 공개 VM 난독화로 합성 바이너리를 다량 생성해, 각 바이너리에서 위 지표(간접분기 엔트로피, 핸들러 길이/카디널리티, 점프테이블 산포, mca 지표 등)를 벡터화하고 LLM에 few-shot 예시로 첨부합니다. 실환경 시료는 지도 없이도 지문 유사도로 스스로 판별하게 합니다. (tigress.cs.arizona.edu, tigress.wtf)
“루프–스위치 모티프 서명” 생성 규칙
가상화 연구(디버추얼라이제이션 문헌)에서 보고된 인터프리터 구조를 규칙화해, CFG 패턴을 해시로 압축(예: [헤드 루프 1개 + 핸들러 군집 k개 + 간접분기 허브 out-degree d] 형태)합니다. LLM에는 해시와 소수 지표만 주어 ‘해시 매칭+설명’ 답변을 유도합니다. (www2.cs.arizona.edu, cs.gmu.edu, ScienceDirect)
“설명가능 프롬프트” 포맷 표준화
LLM 입력을 “정규화 CFG(요약) + 통계표 + 동적 힌트 + 예시 템플릿”의 고정 포맷으로 만들고, 각 필드에 LLVM 도구가 산출한 근거(툴/옵션/버전)를 함께 첨부합니다. 판별 정확도뿐 아니라 근거 회수(Explainability)와 재현성을 동시에 확보합니다. (llvm.org)

참고: LLVM은 바이너리 핸들링에 MC/objdump/Object 계층, 실행 프로파일과 레이아웃에 BOLT, 정적 성능 분석에 llvm-mca, 동적 트레이싱에 XRay, 분석·정규화에 MIR/MLIR 등 폭넓은 도구군을 제공합니다. 위 조합은 “난독화 코드 → (LLVM로) 정규화된 특징/서명 → (LLM로) 최종 판별”의 2단 구성으로 설계하는 것을 전제로 합니다. (The LLVM Project Blog, llvm.org, GitHub, mlir.llvm.org)

Perplexity AI의 응답

LLVM을 '바이너리를 이용한 제어구조 판별 도구(llm) 개발'에 적용할 수 있는 10가지 아이디어

LLVM IR로 제어흐름 포맷 변환: 바이너리를 LLVM IR로 리프팅해 아키텍처 독립적이고 난독화에도 강인한 제어흐름 그래프(CFG)를 생성, loop-switch 변형 탐지에 활용.
기본 블록 간 분기 패턴 추출: LLVM의 기본 블록(block) 및 분기 명령(branch, switch 등) 분포를 수치화해 난독화 특징(예: loop-switch 집중도)으로 활용.
Loop-Switch 특이 구조 정량화: LLVM IR에서 반복문(loops)과 switch-case 분기 사용 정도를 정량적으로 분석, fetch-detect-execute 패턴의 비정상적 빈도 탐지.
바이너리의 함수 호출 그래프(FCG) 추출: 바이너리를 LLVM으로 변환한 뒤 함수 호출 그래프를 자동 생성·시각화해, 전형적인 가상머신 기반 난독화 패턴(짧은 dispatcher, 비정상적 호출 깊이 등) 탐지에 활용.patents.google
데이터 흐름 분석 및 변수 추적: LLVM IR에서 주요 변수 및 피연산자의 흐름을 추적, fetch-detect-execute 패턴이 반복되는 구조 탐지.
패턴 매칭 기반 자동 탐지 룰 개발: LLVM IR을 패턴 벡터로 변환 후, 기존 난독화된 코드에서 추출한 패턴과 비교하여 유사도 검출.
IR 이미지 변환 + 딥러닝 모델 적용: LLVM IR을 이미지로 변환, CNN 등 딥러닝 모델에 입력시켜 바이너리의 난독화 여부 자동 판별.foxforensic.tistory+1

창의적인 추가 아이디어

LLVM Pass로 실시간 난독화 패턴 탐지: 직접 LLVM Pass를 제작해 IR 생성 단계에서 loop-switch, dispatcher 등 의심 패턴을 실시간 탐지·로그화.
IR 코드 특징 기반 anomaly score 도출: LLVM IR에서 각 제어구조, 분기, 루프의 빈도와 네트워크 특성을 수치화한 뒤 통계적 이상치 탐지로 난독화 여부를 평가.
자연어로 해석된 IR 기반 LLM fine-tuning: 주요 LLVM IR 패턴을 자연어 설명(코드 코멘트 등)으로 LLM에 직접 식별 정보를 제공, LLM의 판별력을 강화.

각 방법은 LLVM의 범용성(언어/아키텍처 독립성), 코드 패턴의 정량적 분석, ML/AI와의 융합까지 다양한 방식으로 활용할 수 있습니다.kci+2

출처 :

추가 질문

- 프롬프트 : 기존 연구에서 LLVM이 제어 흐름 조작을 탐지하는 데 어떻게 활용됐는지 사례를 기반으로 제안

- 응답 :

LLVM 기반의 제어 흐름 조작 탐지 활용 사례는 다음과 같이 제안할 수 있습니다.

Control Flow Flattening 난독화 탐지
LLVM 패스(Transform Pass)를 통해 의도적으로 제어흐름을 switch-case 및 dispatcher로 평면화하는 난독화 패턴(Control Flow Flattening)에 대한 전후의 제어 흐름 그래프(CFG) 변화를 분석함으로써, 비정상적으로 변형된 코드(난독화 본문 진입 이후 모든 블록이 한 지점으로 돌아가는 구조) 탐지 사례가 있습니다.ufo.stealien
LLVM IR 기반 정적 분석
바이너리 또는 프로그램을 LLVM IR로 리프팅한 후, 정적 분석을 통해 분기문, 루프, 함수 호출의 제어흐름을 추출·정량화하여, 자주 사용되는 분기 구조 등 비정상적인 흐름의 패턴(특정 블록에서 반복적으로 분기)를 감지하는 기법이 검증된 바 있습니다.koreascience+1
이미지 변환 + 머신러닝 모델 적용 사례
난독화 및 조작이 적용된 바이너리를 LLVM IR로 디컴파일한 뒤, IR 코드를 이미지로 변환하고 CNN 기반 머신러닝 모델에 학습시켜 제어 흐름 변형 및 난독화 적용 여부를 판별하는 연구가 제시되었습니다.kci

이와 같이, LLVM은 IR 변환을 활용한 제어 흐름 그래프 분석, 커스텀 탐지 패스 개발, ML 기반 자동 탐지 등 다양한 방식으로 제어 흐름 조작 탐지에 활용되고 있습니다.

아이디어 종합 정리

- 다양한 방법을 통한 패턴 통계 및 정량화 방법 등의 수치적 방법의 사용

- CFG 나 FCG 추출하여 탐지에 활용

- 실제 Flattening 전후 CFG 변화를 분석, 모든 블록이 한 지점으로 돌아가는 구조를 탐지한 사례

- 바이너리를 IR로 디컴파일 한 후 IR 코드를 이미지 변환한 후 CNN 기반 머신러닝 모델에 학습, 제어 흐름 변형 및 난독화 여부 판별 연구

AI 별 응답 경향 차이

Chat GPT-5-thinking

- 영어권 논문 및 블로그를 참고

- 보다 전문적이며, 난이도가 있는 내용을 소개

- 각 아이디어 당 1개 이상의 출처를 제공

Perplexity AI

- 한국 논문 및 블로그를 참고

- 기존 틀에서 벗어나서 다양한 시도의 연구를 소개

- 요청 사항에 비해 간단한 내용 설명

[LLVM tutorial] 10

snejs — Tue, 22 Jul 2025 13:01:38 +0900

LLVM tutorial

LLVM tutorial 10장을 바탕으로 정리했다.

https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl10.html

10. Kaleidoscope: Conclusion and other useful LLVM tidbits — LLVM 22.0.0git documentation

We have a couple of common questions about code in the LLVM IR form - let’s just get these out of the way right now, shall we? Kaleidoscope is an example of a “portable language”: any program written in Kaleidoscope will work the same way on any targ

llvm.org

LLVM tutorial 에서 다룬 내용을 정리하고, Kaleidoscope 언어와 LLVM IR 에 대한 특징을 알려준다.

10.1. Tutorial Conclusion

1장에서 9장까지의 LLVM tutorial을 통해 Kaleidoscope 언어의 구조를 확장하는 과정을 배우며, LLVM에 대한 이해를 넓힐 수 있었다. 필자는 앞선 내용을 정리하며, 다음과 같은 내용을 설명한다.

Kaleidoscope 언어를 통해 만들어 낸 것

Lexer
Parser
AST
Code generator
Interactive run-loop(대화형 실행 루프) 와 JIT
Emitted debug information in standalone executables (독립 실행형 실행 파일로 디버그 정보 출력)

Kaleidoscope 언어가 지원하는 기능

사용자 정의 이항 및 단항 연산자
즉각적인 평가를 위한 JIT 컴파일
SSA 구조를 갖춘 몇 가지의 제어 흐름 구조

Kaleidoscope 언어에 추가할 수 있는 기능

Global variables
- 값 조회 후 local symbol table에 없으면 global symbol table 확인 후 accept/reject
Typed variables
- 변수 정의 시 type 지정 추가
- 변수의 타입을 symbol table에 함께 기록
Arrays, structs, vectors, etc
- Array FAQ : https://llvm.org/docs/GetElementPtr.html
- Syntax

<result> = getelementptr <ty>, ptr <ptrval>{, <ty> <idx>}*
<result> = getelementptr inbounds <ty>, ptr <ptrval>{, <ty> <idx>}*
<result> = getelementptr nusw <ty>, ptr <ptrval>{, <ty> <idx>}*
<result> = getelementptr nuw <ty>, ptr <ptrval>{, <ty> <idx>}*
<result> = getelementptr inrange(S,E) <ty>, ptr <ptrval>{, <ty> <idx>}*
<result> = getelementptr <ty>, <N x ptr> <ptrval>, <vector index type> <idx>

Standard runtime
- 'printd'와 'putchard'를 사용하는 등 사용자가 외부 함수에 접근할 수 있도록 할 때 runtime에 대한 호출로 해결할 수 있음
Memory management
- stack 접근에서 malloc/free, heap 등 추가
- heap memory 할당 과정에서 사용되는 garbage collector의 지원 : https://llvm.org/docs/GarbageCollection.html
Exception handling support
Object orientation, generics, database access, complex numbers, geometric programming
Unusual domains

10.2. Properties of the LLVM IR

10.2.1. Target Independence (타깃 독립성)

Kaleidoscope language는 portable language(이식 가능한 언어)이다.

portable language 란?

어느 특정 환경에 구애받지 않고, 코드를 수정하지 않아도 다른 환경에서 실행될 수 있는 프로그래밍 언어

LLVM은 IR에서 Target independence를 유지하는 경향이 있으며, C 코드를 생성하여 LLVM이 지원하지않는 타겟에서 컴파일 할 수도 있다.

C 언어는 이식성이 좋다?

이식성을 가진 언어의 대표격으로 C 혹은 C 계열의 언어들을 많이 떠올릴 수 있으나, 일반적으로 이식성이 좋은 편은 아니다.

C 언어의 문제점은 특정 대상에 대한 가정이 너무 많다는 것 이다.

#ifdef __i386__
  int X = 1;
#else
  int X = 42;
#endif

위에서 나오는 __i386__ 은 intel 32bit 아키텍처에서 C/C++ 컴파일러가 자동으로 정의하는 매크로 중 하나로, 타깃 아키텍처가 달라지면 실행되는 코드가 달라질 수 있다.

위와 같은 방식은 타깃에 맞게 지속적인 수정을 요구하기 때문에 유지보수가 어렵고 에러가 생기기 쉽다.

이런 문제 속에서 C 언어로도 이식 가능한 코드를 만들 수 있는데, 이는 여러가지 제약을 요구한다.

고정된 크기의 원시 타입 사용 (예시 : 32bit int, 64bit long)

typedef int32_t my_int;
typedef int64_t my_long;

기존 바이너리들과 ABI(application binary interface) 호환성을 고려하지 않고, 순수 코드 차원에서만 코딩
일부 플랫폼에 종속되는 전처리기 매크로, 비표준 확장 등과 같은 부가적인 기능의 포기

이와 같은 접근은 in-kernel language domain에서 유효하게 사용될 수 있는데, 커널 같은 환경에서는 특정 플랫폼에만 맞춰진 작은 언어 집합만을 사용하는게 일반적이기 때문이다.

여기서 작은 언어 집합만을 사용한다는 것은 malloc, printf, 파일 I/O 등은 사용하지 않고 포인터, 정수 연산, 고정된 자료형, 조건문, 루프 정도만 사용하는 것을 말한다.

10.2.2. Safety Guarantees

언어의 안전성을 보장하기 위해서는 다음의 조합이 요구된다.

Language design
Runtime support
Operating system support

LLVM에서 안전한 언어를 구현하는 것은 가능하나, LLVM IR 자체가 안전성을 보장하는 것은 아니다.

LLVM IR은 안전하지 않은 pointer cast, use-after-free 버그, buffer over-runs 등의 다양한 문제를 야기할 수 있다.

10.2.3. Language-Specific Optimizations

LLVM에는 몇 가지 염두에 두어야할 것이 있는데,

LLVM은 정보를 잃을 수 있음
- LLVM type system이 'name equivalence' 대신 'structural equivalence'를 사용해, 동일한 구조를 가진 두 type에 대해 동일한 구조로 컴파일 하여 어디서 왔는지 알 수 없는 문제
- 예를 들어, LLVM IR에서 SSA 값이 ILP32 머신의 C 'int'에서 온 건지, C 'long'에서 온 건지 디버그 정보가 없다면 둘 다 i32 값으로 컴파일 되어 구별할 방법이 없음
LLVM에서 정보 손실이 일어날 수 있지만, 항상 발생하는 것은 아님
언어별 최적화를 추가하는 것은 쉽지만, 그 방법에는 여러 선택지가 존재

10.3. Tips and Tricks

10.3.1. Implementing portable offsetof/sizeof

LLVM에서 sizeof, offsetof와 variable-sized struct (가변 크기 구조체)를 이식성 있게 구현하는 방법을 설명하는 기술 노트를 소개한다.

문제

C/C++의 sizeof와 offsetof는 컴파일 타임에 평가되지만, LLVM은 IR에서 이를 직접 지원하는 명령어가 없음

해결책

getelementptr의 명령어 사용
getelementptr는 포인터 연산을 수행하는 LLVM 명령어
null 포인터에서 오프셋을 계산해서 크기나 위치를 유도

//===----------------------------------------------------------------------===//
// Implementing Portable sizeof, offsetof and Variable Sized Structures in LLVM
//===----------------------------------------------------------------------===//

June 24, 2005 - Initial Revision

This note describes how an LLVM language front-end can generate portable code
when the sizes of a type or the offset of a field in a structure are needed.
These sizes and offsets are target-specific, and can depend on the alignment,
structure padding and pointer size used by a target.  It is often useful to be
able to defer evaluation of these values until code generation time, for use
when generating LLVM code that is intended to be portable to multiple targets.

To illustrate these concepts, we show how they can be used to implement
efficient variable sized arrays in LLVM (e.g. as used for 'pascal strings').

//===----------------------------------------------------------------------===//
// Using getelementptr for sizeof and offsetof
//

Though LLVM does not contain a special purpose sizeof/offsetof instruction, the
getelementptr instruction can be used to evaluate these values.  The basic idea
is to use getelementptr from the null pointer to compute the value as desired.
Because getelementptr produces the value as a pointer, the result is casted to
an integer before use.

For example, to get the size of some type, %T, we would use something like
this:

  %Size = getelementptr %T* null, int 1
  %SizeI = cast %T* %Size to uint

This code is effectively pretending that there is an array of T elements,
starting at the null pointer.  This gets a pointer to the 2nd T element
(element #1) in the array and treats it as an integer.  This computes the 
size of one T element.

To get the offset of some field in a structure, a similar trick is used.  For
example, to get the address of the 2nd element (element #1) of { sbyte, int* } 
(which depends on the target alignment requirement for pointers), something 
like this should be used:

  %Offset = getelementptr {sbyte,int*}* null, int 0, uint 1
  %OffsetI = cast int** %Offset to uint

This works the same way as the sizeof trick: we pretend there is an instance of
the type at the null pointer and get the address of the field we are interested
in.  This address is the offset of the field.

Note that in both of these cases, the expression will be evaluated to a
constant at code generation time, so there is no runtime overhead to using this
technique.

//===----------------------------------------------------------------------===//
// Using sizeof/offsetof to allocate memory & variable sized structures
//

One time that these techniques are useful is when allocating memory for a
structure with malloc or a garbage collector.  These routines need to know the
actual size of an object.  This is complicated by the fact that some langauges
allow for variable sized structures: structures which end with an array whose
length is determined at allocation time.  This sort of construct is used for
many sorts of language constructs and some languages even allow programmers to
write these explicitly (e.g. C99).

LLVM explicitly supports variable sized arrays through the use of zero length
arrays.  For example, the structure type "{ uint, [0 x float] }" can be used to
define an array of floats of variable size.  Presumably the actual number of
elements is stored in the initial uint element.

In order to dynamically allocate an instance of this data type, code the the
following can be used (which uses the offsetof technique described above):

%array = type { uint, [0 x float] }
implementation
%array *%NewArray(uint %Length) {
  ;; Get the offset of the 'Length' element of the array from the null
  ;; pointer.
  %Size = getelementptr %array* null, int 0, uint 1, uint %Length
  %SizeU = cast float* %Size to uint
  %Ptr = malloc sbyte, uint %SizeU
  %Result = cast sbyte* %Ptr to %array*

  ;; Remember the allocated length for later use.
  %SizePtr = getelementptr %array* %Result, int 0, uint 0
  store uint %Length, uint* %SizePtr
  ret %array* %Result
}

There are many other potential uses of these techniques.

10.3.2. Garbage Collected Stack Frames

LLVM에서 명시적으로 관리되는 스택 프레임을 구현하는 접근 방식을 설명하는 기술 노트를 소개한다.

여기서 스택 프레임을 명시적으로 관리하고자하는 이유는

Garbage collection을 수행하거나 Closure를 지원하고자 할 때
스택을 fork 하거나, 여러 실행 흐름을 만들고자 할 때 (예시로 협업형 스케줄링이 있음)
다양한 메모리 모델을 구현하려고 할 때

이를 통해 시스템 스택 사용을 최소화하고자 한다.

이런 경우의 언어들은 시스템 스택 대신 사용자 정의 스택 또는 힙 구조를 이용해야하나 LLVM은 본래 시스템 스택만을 사용하기 때문에 이를 직접적으로 지원하지 않는다.

해결법

Tail Call + Continuation Passing Style (CPS)

모든 함수 호출을 Tail Call(꼬리 호출)로 만듦
Continuation Passing Style (CPS)로 코드를 변환
시스템 스택을 쓰지 않고도 실행 흐름을 관리할 수 있게 함

CPS? 함수가 결과를 직접 반환하는 대신, 다음 작업(continuation)을 인자로 받아 전달하는 방법

//===----------------------------------------------------------------------===//
//                  Explicitly Managed Stack Frames in LLVM
//===----------------------------------------------------------------------===//

Sep 5, 2004 - Initial revision

Some languages are not content to use the program stack to run their programs:
for whatever reason, they want to manage their stacks explicitly (common reasons
include garbage collected closures, and languages that permit forking stacks,
though there are probably countless others).  As of this writing, LLVM provides
no mechanisms to implement this, and this is a deficiency in LLVM.

This note describes how support for correct tail calls permits the program
to explicit manage its own stack in an arbitrary way (assuming that that machine
stack still exists, and any calls to external C functions use it).


//===----------------------------------------------------------------------===//
// The idea
//

The idea is trivially simple: given guaranteed efficient tail call support, all
calls must use the explicitly managed stack, and all calls are transformed by
the front-end into tail calls.  This first transformation is trivial, as is the
second.

The idea is to transform functions to use continuations instead of standard
calls for non-tail function calls in the program.  Consider this
pseudo-function:


int foo() {
  int X = 4
  int Y = bar(14);
  return X*Y;
}

Assuming the function is unoptimized, the call to bar is not a tail-call: it
does not occur in the tail position. Also, the call to bar may well access the
value contained by X through language-specific means.

Because of this, the stack frame for foo must be explicitly manages (e.g. by a
garbage collector).  It is straight-forward to transform foo into the following
function:

int foo() {
  fooSF = gcalloc { X: int; Y: int }
  fooSF->X = 4;
  fooSF->Y = bar(14);
  return fooSF->X * fooSF->Y;
}

This successfully decouples the storage of X from the stack to an explicitly
managed piece of memory (in this case the GC heap), but the call to bar still
occurs on the system stack (e.g. 14 and the return addrss in foo may be pushed
onto the stack).  To rectify this, we transform the function into continuation
passing style, a well known process that I will not describe here in any more
details (google can tell you lots about it, I'm sure).  Transformed, this
function might look like this:

int foo(continuation C) {
  fooSF = gcalloc { C: continuation; X: int; Y: int }
  fooSF->C = C;
  fooSF->X = 4;
  continuation Next = { foo2, fooSF };
  return bar(Next, 14);
}

int foo2(continuation C, int Y) {
  fooSF = C->SF;
  fooSF->Y = Y;
  continuation Next = fooSF->C;
  return Next->F(Next, fooSF->X*fooSF->Y);
}

To someone unfamiliar with CPS, the transformation above will look very strange.
However, note that now both calls in the resultant functions occur as tail
calls.  The result of applying these two transformations to the whole program
will result in a program that runs in a constant amount of system stack space,
and whose program stack can be managed by any means required by the
source-language.

[LLVM tutorial] 01 ~ 02

snejs — Tue, 8 Jul 2025 13:44:40 +0900

LLVM tutorial

이 글은 LLVM tutorial의 아래 두 챕터를 바탕으로 정리한 글이다.

https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl01.html

1. Kaleidoscope: Kaleidoscope Introduction and the Lexer — LLVM 21.0.0git documentation

This tutorial is illustrated with a toy language called “Kaleidoscope” (derived from “meaning beautiful, form, and view”). Kaleidoscope is a procedural language that allows you to define functions, use conditionals, math, etc. Over the course of th

llvm.org

https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.html

2. Kaleidoscope: Implementing a Parser and AST — LLVM 21.0.0git documentation

With just under 400 lines of commented code (240 lines of non-comment, non-blank code), we fully defined our minimal language, including a lexer, parser, and AST builder. With this done, the executable will validate Kaleidoscope code and tell us if it is g

llvm.org

1.1. The Kaleidoscope Language

Kaleidoscope 언어는 LLVM tutorial을 설명하기 위한 toy language로, 조건문, 반복문, 사용자 정의 연산자 및 간단한 명령줄 인터페이스를 통한 JIT 컴파일, 디버그 정보 등까지 확장한다.

Kaleidoscope는 매우 단순한 체계의 언어이다.

- 64bit 부동 소수점(double)의 유일한 data type을 사용하며, 단일 타입을 가지는만큼 타입 선언도 없다.

# Compute the x'th fibonacci number.
def fib(x)
  if x < 3 then
    1
  else
    fib(x-1)+fib(x-2)

# This expression will compute the 40th number.
fib(40)

- 표준 라이브러리 함수를 호출할 수 있으며, 'extern' 키워드를 사용하여 함수를 사용하기 전에 정의할 수 있다.

- LLVM JIT이 외부 함수들을 연결, 호출하는 작업을 쉽게해주어 외부 표준 라이브러리 함수들을 불러오는게 간편해졌다.

extern sin(arg);
extern cos(arg);
extern atan2(arg1 arg2);

atan2(sin(.4), cos(42))

1.2. Lexer

Lexer는 텍스트 파일의 입력을 받아서, 그것을 해석할 수 있도록 토큰 단위로 잘라주는 역할을 한다.

구현은 C 언어를 기반으로 한다.

- input : text file

- output : tokens (Token enum values OR unknown character)

- token : token code + metadata

- token code : token의 종류 (ex) tok_number)

- metadata : token의 실제 값 (ex) 42.0)

// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
// Every named token below is negative, so it cannot collide with the [0-255]
// character codes.
enum Token {
  tok_eof = -1, // end of input

  // commands
  tok_def = -2,    // the "def" keyword
  tok_extern = -3, // the "extern" keyword

  // primary
  tok_identifier = -4, // an identifier
  tok_number = -5,     // a numeric literal
};

static std::string IdentifierStr; // Filled in if tok_identifier
static double NumVal;             // Filled in if tok_number

Lexer로부터 반환되는 Token enum 값들에 포함되지 않은 unknown character들은 (ex) '+') ASCII value로 반환된다.

Lexer의 실제구현은 다음 gettok() 한 개의 함수에 구현되어 있다.

- gettok() 함수는 입력을 읽고 다음 토큰을 만들어서 반환하는 역할을 하며, static 변수로 선언되는 LastChar 변수는 문자 및 EOF 값을 저장하며, 읽고 처리하지 않은 문자가 저장되어 있다.

gettok()의 절차

1. token 사이의 whitespace는 무시

/// gettok - Return the next token from standard input.
static int gettok() {
  static int LastChar = ' ';

  // Skip any whitespace.
  while (isspace(LastChar))
    LastChar = getchar();

2. identifier의 처리 : 문자로 시작하는 모든 token이 여기에 해당하며, def와 extern의 키워드들을 개별적인 토큰으로 반환

if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
  IdentifierStr = LastChar;
  while (isalnum((LastChar = getchar())))
    IdentifierStr += LastChar;

  if (IdentifierStr == "def")
    return tok_def;
  if (IdentifierStr == "extern")
    return tok_extern;
  return tok_identifier;
}

3. numeric value의 처리 : 숫자로 시작하는 double 값을 저장한 뒤 tok_number 반환

(strtod에 의해 유효하지 않은 부분은 제거됨, 1.23.45.67 -> 1.23)

if (isdigit(LastChar) || LastChar == '.') {   // Number: [0-9.]+
  std::string NumStr;
  do {
    NumStr += LastChar;
    LastChar = getchar();
  } while (isdigit(LastChar) || LastChar == '.');

  NumVal = strtod(NumStr.c_str(), 0);
  return tok_number;
}

4. comment는 문장의 끝을 찾아 EOF 토큰 혹은 다음 문장의 첫 토큰을 반환

if (LastChar == '#') {
  // Comment until end of line.
  do
    LastChar = getchar();
  while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');

  if (LastChar != EOF)
    return gettok();
}

5. EOF이면 tok_eof를 반환하고 어디에도 해당되지 않은 문자열은 ASCII 값 그대로 반환

  // Check for end of file.  Don't eat the EOF.
  if (LastChar == EOF)
    return tok_eof;

  // Otherwise, just return the character as its ascii value.
  int ThisChar = LastChar;
  LastChar = getchar();
  return ThisChar;
}

2.1. Chapter 2 Introduction

Chapter 2 에서 다루는 것 : AST의 정의, Kaleidoscope language의 parser의 build

여기에서의 parser는 Recursive Descent Parsing(이항 표현식 외 전체 구현)과 Operator-Precedence Parsing(이항 표현식을 구현)으로 구현된다.

2.2. The Abstract Syntax Tree (AST)

Kaleidoscope의 문법은 표현식(expression), 프로토타입(prototype), 함수 정의(function)로 크게 세 가지로 나뉘며, 이에 대응하는 AST 노드를 각각 구현한다.

Expression

- ExprAST : 모든 expression node 들에 대한 기본이 되는 interface 역할을 하는 class

- NumberExprAST : 숫자 리터럴을 표현하기 위한 class, 해당 리터럴의 실제 값을 저장

- VariableExprAST : 변수 참조를 표현하는 class, 변수의 이름을 저장하여 참조할 수 있게 함
- BinaryExprAST : binary operator을 표현하는 class, 연산자와 좌우 피연산자의 expression을 저장

- CallExprAST : function call을 표현하는 class, 호출할 함수의 이름과 전달할 인자 목록을 저장

/// ExprAST - Common root of every expression node in the AST.
class ExprAST {
public:
  virtual ~ExprAST() = default;
};

/// NumberExprAST - AST node for a numeric literal such as "1.0"; it keeps the
/// literal's value.
class NumberExprAST : public ExprAST {
  double Val;

public:
  NumberExprAST(double Value) : Val(Value) {}
};

/// VariableExprAST - AST node for a reference to a variable such as "a"; it
/// keeps the variable's name.
class VariableExprAST : public ExprAST {
  std::string Name;

public:
  VariableExprAST(const std::string &VarName) : Name(VarName) {}
};

/// BinaryExprAST - AST node for a binary operator; it keeps the operator
/// character and owns both operand sub-expressions.
class BinaryExprAST : public ExprAST {
  char Op;
  std::unique_ptr<ExprAST> LHS;
  std::unique_ptr<ExprAST> RHS;

public:
  BinaryExprAST(char Operator, std::unique_ptr<ExprAST> Left,
                std::unique_ptr<ExprAST> Right)
      : Op(Operator), LHS(std::move(Left)), RHS(std::move(Right)) {}
};

/// CallExprAST - AST node for a function call; it keeps the callee's name and
/// owns the argument expressions.
class CallExprAST : public ExprAST {
  std::string Callee;
  std::vector<std::unique_ptr<ExprAST>> Args;

public:
  CallExprAST(const std::string &CalleeName,
              std::vector<std::unique_ptr<ExprAST>> CallArgs)
      : Callee(CalleeName), Args(std::move(CallArgs)) {}
};

Conditional control flow 구현되지 않음

Prototype & Function

- PrototypeAST : 함수의 시그니처 부분 (ex) def func1 (arg1, arg2))
- FunctionAST : 함수의 시그니처 + body

Kaleidoscope에서는 데이터 타입이 한 개 이므로 함수의 타입은 파라미터 개수만으로 결정된다.

/// PrototypeAST - Describes a function's "prototype": the function's name plus
/// the names of its arguments (and so, implicitly, how many arguments the
/// function accepts).
class PrototypeAST {
  std::string Name;
  std::vector<std::string> Args;

public:
  PrototypeAST(const std::string &FnName, std::vector<std::string> ArgNames)
      : Name(FnName), Args(std::move(ArgNames)) {}

  /// Name given to the function at construction time.
  const std::string &getName() const { return Name; }
};

/// FunctionAST - A complete function definition: it owns the prototype and
/// the expression that forms the body.
class FunctionAST {
  std::unique_ptr<PrototypeAST> Proto;
  std::unique_ptr<ExprAST> Body;

public:
  FunctionAST(std::unique_ptr<PrototypeAST> Prototype,
              std::unique_ptr<ExprAST> BodyExpr)
      : Proto(std::move(Prototype)), Body(std::move(BodyExpr)) {}
};

2.3. Parser Basics

Parser 파트에서는 lexer 파트에서 구현한 gettok()을 사용하여 토큰을 바탕으로 AST를 생성하는 코드를 설명한다.

Parser 구현을 위한 basic helper routines

- CurTok : 현재 파싱해야하는 토큰

- getNextToken() : gettok() 함수를 호출하여 CurTok에 다음으로 파싱할 토큰을 저장

- LogError() : 파싱 과정 중 오류 발생 시에 호출되는 에러 처리 함수. 다양한 반환 타입을 갖는 파싱 함수 내에서 일관된 방식으로 오류를 처리할 수 있도록 도와줌

/// CurTok - One-token buffer: the token the parser is currently looking at.
static int CurTok;

/// getNextToken - Fetch the next token from the lexer, store it in CurTok and
/// return that same value.
static int getNextToken() {
  CurTok = gettok();
  return CurTok;
}

/// LogError - Print the given message to stderr and return a null expression
/// pointer, so a parse function can report and fail in a single statement.
std::unique_ptr<ExprAST> LogError(const char *Str) {
  std::unique_ptr<ExprAST> Failed; // default-constructed, i.e. null
  fprintf(stderr, "Error: %s\n", Str);
  return Failed;
}
/// LogErrorP - Same reporting as LogError, but returns a null prototype
/// pointer for use in prototype-parsing code.
std::unique_ptr<PrototypeAST> LogErrorP(const char *Str) {
  LogError(Str); // called only for the message it prints; its result is dropped
  return {};
}

2.4. Basic Expression Parsing

이 파트에서는 가장 간단하게 구현되는 리터럴 수를 파싱하는 것을 구현하는 것부터 파싱 과정에서 사용되는 방법들을 소개한다.

아래 ParseNumberExpr에서는 tok_number 토큰에서 NumVal 값을 가져와 NumberExprAST 노드를 생성하고 현 토큰은 소비한 후 노드를 반환한다.

여기서 parser는 node 생성에 필요한 모든 토큰을 처리한 후 다음 토큰을 lexer buffer에 반환(getNextToken())하고 있으며, 이는 Recursive Descent Parser의 표준 방식이다.

/// numberexpr ::= number
///
/// Builds a NumberExprAST from NumVal, then advances past the number token.
static std::unique_ptr<ExprAST> ParseNumberExpr() {
  std::unique_ptr<ExprAST> Node = std::make_unique<NumberExprAST>(NumVal);
  getNextToken(); // advance past the number
  return Node;
}

ParseParenExpr은 '(' expression ')' 의 괄호를 처리하는 함수로, expression과 ')'이 오류가 났을 시 각각에 대한 대응을 nullptr 반환과 LogError를 반환하는 것으로 보여주고 있다.

모든 토큰이 오류 없이 parsing되면 각 토큰을 소모하고 노드를 반환한다. 이 때 AST에서는 괄호를 제거하고 expression만 노드에 저장하게 되는데 이는 괄호가 표현식의 우선순위와 그룹핑을 위한 도구로만 사용될 뿐 어떠한 의미를 담고 있지 않기 때문에 제거되어도 영향을 끼치지 않기 때문이다.

/// parenexpr ::= '(' expression ')'
static std::unique_ptr<ExprAST> ParseParenExpr() {
  getNextToken(); // eat (.
  auto V = ParseExpression();
  if (!V)
    return nullptr;

  if (CurTok != ')')
    return LogError("expected ')'");
  getNextToken(); // eat ).
  return V;
}

또한 위의 코드에서 ParseExpression을 호출하여 expression에 대한 처리를 다른 함수에 맡기는 것을 볼 수 있다. 이는 호출을 통해 재귀를 사용하는 Recursive Descent Parser의 특징을 보여주며, 이러한 특징을 바탕으로 괄호가 중첩되는 구조도 쉽게 처리할 수 있다.

다음은 Identifier에 대한 처리를 담당하는 함수이다. identifier에는 변수 참조와 함수 호출의 두가지 패턴이 있으며 이를 한 함수에서 조건문 분기를 통해 각각 적절한 노드를 반환하는 것을 확인할 수 있다.

조건문 분기 과정에서 look-ahead를 사용하며 다음 토큰을 미리 보고 '('와의 일치 여부에 따라 생성되는 노드가 갈린다.

/// identifierexpr
///   ::= identifier
///   ::= identifier '(' expression* ')'
static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
  std::string IdName = IdentifierStr;

  getNextToken();  // eat identifier.

  if (CurTok != '(') // Simple variable ref.
    return std::make_unique<VariableExprAST>(IdName);

  // Call.
  getNextToken();  // eat (
  std::vector<std::unique_ptr<ExprAST>> Args;
  if (CurTok != ')') {
    while (true) {
      if (auto Arg = ParseExpression())
        Args.push_back(std::move(Arg));
      else
        return nullptr;

      if (CurTok == ')')
        break;

      if (CurTok != ',')
        return LogError("Expected ')' or ',' in argument list");
      getNextToken();
    }
  }

  // Eat the ')'.
  getNextToken();

  return std::make_unique<CallExprAST>(IdName, std::move(Args));
}

ParsePrimary는 expression-parsing에서 가장 상위의 primary를 파싱하는 함수이다.

현재 토큰을 확인하여 token 종류에 따라 함수를 호출한다. 여기서 CurTok은 처리되지 않은 토큰을 가리키고 있기 때문에 다음 토큰을 가져올 필요 없이 바로 확인 후 분기할 수 있다.

/// identifierexpr
///   ::= identifier
///   ::= identifier '(' expression* ')'
static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
  std::string IdName = IdentifierStr;

  getNextToken();  // eat identifier.

  if (CurTok != '(') // Simple variable ref.
    return std::make_unique<VariableExprAST>(IdName);

  // Call.
  getNextToken();  // eat (
  std::vector<std::unique_ptr<ExprAST>> Args;
  if (CurTok != ')') {
    while (true) {
      if (auto Arg = ParseExpression())
        Args.push_back(std::move(Arg));
      else
        return nullptr;

      if (CurTok == ')')
        break;

      if (CurTok != ',')
        return LogError("Expected ')' or ',' in argument list");
      getNextToken();
    }
  }

  // Eat the ')'.
  getNextToken();

  return std::make_unique<CallExprAST>(IdName, std::move(Args));
}

2.5. Binary Expression Parsing

이 파트에서는 이항 연산자가 포함된 표현식에 대해 parsing하는 과정을 다루며 Operator-Precedence Parsing 방법을 사용한다.

Kaleidoscope language의 기본 형태에서는 4개의 이항 연산자 (<, +, -, *)만을 지원하며, 각 연산자에 맞는 우선순위를 반환하는 함수인 GetTokPrecedence를 사용할 수 있다. 4개의 연산자 외에는 -1이 반환되도록 설계되어있다.

/// BinopPrecedence - This holds the precedence for each binary operator that is
/// defined.
static std::map<char, int> BinopPrecedence;

/// GetTokPrecedence - Get the precedence of the pending binary operator token.
///
/// Returns the value stored for CurTok in BinopPrecedence, or -1 when CurTok
/// is not an ASCII character, has no entry, or has a non-positive entry.
static int GetTokPrecedence() {
  if (!isascii(CurTok))
    return -1;

  // Use find() rather than operator[]: operator[] would insert a zero entry
  // into the table for every token that is not a declared operator.
  const auto It = BinopPrecedence.find(static_cast<char>(CurTok));
  if (It == BinopPrecedence.end() || It->second <= 0)
    return -1;
  return It->second;
}

int main() {
  // Install standard binary operators.
  // 1 is lowest precedence.
  // A larger number means a higher precedence.
  BinopPrecedence['<'] = 10;
  BinopPrecedence['+'] = 20;
  BinopPrecedence['-'] = 20;
  BinopPrecedence['*'] = 40;  // highest.
  // NOTE(review): the rest of main appears to be elided in this excerpt.
  ...
}

이항 연산자가 포함된 표현식에 대한 처리는 그 표현식을 [binop, primaryexpr] 쌍의 여러 조각으로 나눠 시퀀스를 만들어 낸다.

예시 : “a+b+(c+d)*e*f+g”

[binop, primaryexpr] sequence : [+, b] [+, (c+d)] [*, e] [*, f], [+, g]

위의 예에서 'a'는 먼저 구문 분석된 상태이며, 시퀀스에서 각 연산자의 우선순위를 비교하며 primaryexpr 간의 node를 구성하고 연결한다.

'(c+d)'의 경우 expression으로서 내부적으로 처리되기 때문에 해당 단계에서 고려할 필요가 없다.

아래 코드는 a + b ... 에서 a의 구문 분석을 마친 후 우선순위 0을 준 뒤 오른쪽에 따라오는 연산들을 처리한다.

/// expression
///   ::= primary binoprhs
///
/// Parses the leading primary, then passes it to ParseBinOpRHS with a minimum
/// precedence of 0 so the following [binop, primary] pairs are attached.
static std::unique_ptr<ExprAST> ParseExpression() {
  if (auto LHS = ParsePrimary())
    return ParseBinOpRHS(0, std::move(LHS));
  return nullptr;
}

나머지 '+b+(c+d)*e*f+g'를 [0, a] 인자를 갖고 처리하는 과정이다.

/// binoprhs
///   ::= ('+' primary)*
///
/// ExprPrec is the minimum operator precedence this call is allowed to
/// consume; LHS is the expression parsed so far. Returns the combined
/// expression, or nullptr if an operand fails to parse.
static std::unique_ptr<ExprAST> ParseBinOpRHS(int ExprPrec,
                                              std::unique_ptr<ExprAST> LHS) {
  // If this is a binop, find its precedence.
  while (true) {
    int TokPrec = GetTokPrecedence(); // precedence of the current token

    // If this is a binop that binds at least as tightly as the current binop,
    // consume it, otherwise we are done.
    // Return when the current operator's precedence is below ExprPrec.
    // e.g. a * b + c
    // '+' has lower precedence than '*', so only the '*' part is built here.
    if (TokPrec < ExprPrec) 
      return LHS;

    // Okay, we know this is a binop.
    int BinOp = CurTok; // remember the current binary operator
    getNextToken(); // eat binop

    // Parse the primary expression after the binary operator.
    auto RHS = ParsePrimary(); // parse the operand
    if (!RHS)
      return nullptr;

    // If BinOp binds less tightly with RHS than the operator after RHS, let
    // the pending operator take RHS as its LHS.
    // a + b * c
    int NextPrec = GetTokPrecedence(); 
    if (TokPrec < NextPrec) {	
      // The next operator has higher precedence, so "b * c" must be completed
      // first; recurse to build it.
      // Case where the operator after the operand binds tighter: a + b * c * d
      // Recursing with TokPrec + 1 raises the minimum precedence so that
      // "b * c * d" is grouped into a single RHS.
      RHS = ParseBinOpRHS(TokPrec + 1, std::move(RHS));
      if (!RHS)
        return nullptr;
    }

    // Merge LHS/RHS.
    LHS =
        std::make_unique<BinaryExprAST>(BinOp, std::move(LHS), std::move(RHS));
  }
}

2.6. Parsing the Rest

마지막으로 함수 prototype 처리 부분으로 이는 함수의 시그니처를 표현하는 부분이다.

prototype은 함수 본문 정의에도 사용되지만 'extern'을 사용한 함수 선언에도 사용된다.

prototype의 정의는 다음과 같다.

/// prototype
///   ::= id '(' id* ')'
static std::unique_ptr<PrototypeAST> ParsePrototype() {
  if (CurTok != tok_identifier)
    return LogErrorP("Expected function name in prototype");

  std::string FnName = IdentifierStr;
  getNextToken();

  if (CurTok != '(')
    return LogErrorP("Expected '(' in prototype");

  // Read the list of argument names.
  std::vector<std::string> ArgNames;
  while (getNextToken() == tok_identifier)
    ArgNames.push_back(IdentifierStr);
  if (CurTok != ')')
    return LogErrorP("Expected ')' in prototype");

  // success.
  getNextToken();  // eat ')'.

  return std::make_unique<PrototypeAST>(FnName, std::move(ArgNames));
}

위에 구현한 prototype에 더불어 함수의 본문을 정의하는 경우를 대비해 함수 본문을 구현하는 것을 덧붙이는 것으로 완성한다.

각 'def'와 'extern' 키워드에 대한 처리이다.

/// definition ::= 'def' prototype expression
///
/// Returns nullptr if either the prototype or the body fails to parse.
static std::unique_ptr<FunctionAST> ParseDefinition() {
  getNextToken(); // skip the 'def' keyword
  auto Proto = ParsePrototype();
  if (!Proto)
    return nullptr;

  auto Body = ParseExpression();
  if (!Body)
    return nullptr;
  return std::make_unique<FunctionAST>(std::move(Proto), std::move(Body));
}
/// external ::= 'extern' prototype
///
/// An extern declaration is a prototype with no body.
static std::unique_ptr<PrototypeAST> ParseExtern() {
  getNextToken(); // skip the 'extern' keyword
  auto Proto = ParsePrototype();
  return Proto;
}

마지막으로 익명 함수 트릭을 써서 호출 없이 결과를 내도록 하는 방법을 사용한 parsing 이다.

사용자가 1 + 2 * 3 을 입력했을 때 함수 호출 없이 바로 실행하여 출력하게 하고자 할 때 사용할 수 있다.

/// toplevelexpr ::= expression
///
/// Wraps a top-level expression in a function whose prototype has an empty
/// name and no arguments.
static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
  auto Body = ParseExpression();
  if (!Body)
    return nullptr;

  // Build the anonymous prototype.
  auto AnonProto =
      std::make_unique<PrototypeAST>("", std::vector<std::string>());
  return std::make_unique<FunctionAST>(std::move(AnonProto), std::move(Body));
}

2.7. The Driver

아래는 모든 파싱을 호출할 수 있는 최상위 루프 함수이다.

주목할만한 점은 최상위 세미콜론을 무시하는것으로, 이는 파서에게 입력이 끝났음을 알려주는 역할을 한다.

/// top ::= definition | external | expression | ';'
///
/// Driver loop: prints the prompt, then dispatches on the current token until
/// end of input is reached.
static void MainLoop() {
  for (;;) {
    fprintf(stderr, "ready> ");

    if (CurTok == tok_eof)
      return;

    if (CurTok == ';') { // top-level semicolons are skipped
      getNextToken();
      continue;
    }

    if (CurTok == tok_def)
      HandleDefinition();
    else if (CurTok == tok_extern)
      HandleExtern();
    else
      HandleTopLevelExpression();
  }
}

전체 코드

LLVM tutorial 2.9. 참고

원격 접속해서 코드 난독화하기

snejs — Fri, 16 May 2025 14:00:49 +0900

노트북에서의 한계

개인 노트북으로 .c 코드를 난독화 및 컴파일하여 데이터셋을 만드는 과정을 수행했었다.

그런데 데이터셋 규모가 커지면서 하드 용량이 4GB 밖에 남지 않는 상황에 놓이게 되어, 더 이상 개인 노트북으로는 감당이 어렵게되었다.

이를 해결하기 위해 연구실 컴퓨터를 요청하여 원격 접속으로 난독화를 수행하기로 했다.

SSH로 원격 접속하기

https://winterbloooom.github.io/computer%20science/linux/2022/03/22/ssh.html

[Ubuntu] SSH로 원격접속 하기 | winterblooom (EunGi Han)

사용 환경 서버: Ndivia Jetson AGX Xavier, Ubuntu 18.04 LTS 클라이언트: 일반 노트북, Ubuntu 18.04 LTS LAN 선, 인터넷 공유기 필요 목적 ROS가 돌아갈 메인 PC인 Jetson이 있고, 거리가 조금 떨어진 원격지에서 노

winterbloooom.github.io

자세한 내용은 위 블로그를 참고하면서 했으며, 기본적인 것은 이미 구성되어 있는 상태였기 때문에

클라이언트 입장에서 접속하기만 하면 됐다.

원격 접속은 ssh -Y 서버유저명@서버IP주소 를 이용하면 된다.

우분투 컨테이너 실행 및 접속

https://velog.io/@woody35545/Docker%EB%A1%9C-Ubuntu-%EC%82%AC%EC%9A%A9%ED%95%B4%EB%B3%B4%EA%B8%B0

Docker로 Ubuntu 사용해보기

m1 mac 환경에서 Docker에 Ubuntu 컨테이너를 생성한 후 간단하게 사용해보자

velog.io

원격에서는 우분투 컨테이너를 이용해서 작업하기로 했다. 이 또한 미리 설치되어 있어 추가 작업 없이 바로 사용할 수 있었다.

docker ps -a

위 코드를 통해 우분투 컨테이너 상태를 확인할 수 있다.

우분투 컨테이너를 사용하기 위해서는 컨테이너를 실행하고 접근하면 된다.

ubuntu_test 라는 컨테이너로 접속할 것이기 때문에 다음 코드를 사용해서 접속 할 수 있었다.

# 컨테이너를 실행
docker start ubuntu_test

# 컨테이너에 접속
docker attach ubuntu_test

우분투에 접속한 이후에는 난독화를 수행하기 위해 가장 기초적인 설치 및 설정 부터 개인 노트북에서 수행했던

tigress 설치, gcc 컴파일러 설치, make 설치 등을 수행했다.

데이터 옮기기

난독화해야 하는 데이터들은 원격의 우분투 컨테이너로 파일을 옮겨야 한다.

1차적으로 로컬에서 원격으로 옮기고 2차로 원격에서 우분투 컨테이너로 옮기는 2번의 작업이 요구되었다.

로컬 ↔ 원격

'scp [옮기고자 하는 대상] [옮길 위치]' 형식으로 사용하면 되며, 파일을 여러 개 나열하면 한번에 여러 개를 보낼 수도 있다.

또한 -r 옵션을 달아주면 디렉토리도 옮길 수 있다.

# 로컬에서 원격으로 파일 옮기기
scp [파일이름] [원격 사용자명]@[ip]:[원격에서 받는 위치]

# 원격에서 로컬로 파일 옮기기
scp [원격 사용자명]@[ip]:[원격 파일 위치]/[파일이름] [로컬에서 받는 위치]

# 디렉토리 옮기기
scp -r [디렉토리 이름] [원격 사용자명]@[ip]:[원격에서 받는 위치]

원격 ↔ 우분투 컨테이너

우분투 컨테이너는 docker cp 를 사용하면 파일을 복사할 수 있으며, 복사되는 파일은 한번에 하나씩만 가능하다.

# 원격에서 우분투 컨테이너로
docker cp [파일 이름] [우분투 컨테이너 이름]:[파일 받는 위치] 

# 우분투 컨테이너에서 원격으로
docker cp [우분투 컨테이너 이름]:[파일위치/파일이름] [파일 받는 위치(원격)]

난독화하기

난독화하기에 앞서 몇 가지 수정해야할 사항이 추가되었다.

데이터셋의 규모가 커지면서 더 이상 makefile만을 사용해서 난독화를 자동화하는 것이 불가능해졌다.

명령어가 길어지는 문제를 해결하기 위해 기존 makefile에서 작성된 내용을 transform.sh 파일로 바꿔서 추가하고

하위 폴더에서 makefile을 이용해 transform.sh 파일을 실행해주도록 구성했다.

디렉토리 구성

project
├── transform.sh   
├── src
    ├── Makefile
    ├── Code.c

transform.sh

#!/bin/bash
# Usage: ./transform.sh <mode> <src_list_file> <output_dir> <tigress_path>
#
# Obfuscates every C file listed in <src_list_file> (one path per line) with
# Tigress and writes the result to <output_dir>/<basename>_<mode>.c.
# <mode> is one of: virtualize, flatten, opaque.

if [ "$#" -ne 4 ]; then
    echo "Usage: $0 <mode> <src_list_file> <output_dir> <tigress_path>" >&2
    exit 1
fi

MODE="$1"
SRC_LIST="$2"
OUT_DIR="$3"
TIGRESS="$4"

# Fail fast on an unknown mode; otherwise every file would be reported as
# processed without any output being produced.
case "$MODE" in
    virtualize|flatten|opaque) ;;
    *)
        echo "Unknown mode '$MODE' (expected virtualize, flatten or opaque)" >&2
        exit 1
        ;;
esac

mkdir -p "$OUT_DIR"

# Split the list into 100-line chunks inside a private temporary directory, so
# unrelated chunk_* files in the working directory are never read or deleted.
# The trap removes the directory when the script exits.
CHUNK_DIR=$(mktemp -d) || exit 1
trap 'rm -rf "$CHUNK_DIR"' EXIT
split -l 100 "$SRC_LIST" "$CHUNK_DIR/chunk_"

for chunk in "$CHUNK_DIR"/chunk_*; do
    # With an empty list no chunk file exists and the glob stays unexpanded.
    [ -e "$chunk" ] || continue

    while IFS= read -r file; do
        # Skip blank lines instead of handing an empty path to Tigress.
        [ -n "$file" ] || continue

        echo "[*] Processing $file..."
        base=$(basename "$file" .c)

        # '--Functions=*' is quoted so the shell can never glob-expand it.
        case "$MODE" in
            virtualize)
                "$TIGRESS" --Transform=Virtualize '--Functions=*' --out="$OUT_DIR/${base}_virtualize.c" "$file"
                ;;
            flatten)
                "$TIGRESS" --Transform=Flatten '--Functions=*' --out="$OUT_DIR/${base}_flatten.c" "$file"
                ;;
            opaque)
                "$TIGRESS" \
                    --Transform=InitOpaque \
                    '--Functions=*' \
                    --Transform=AddOpaque \
                    '--Functions=*' \
                    --out="$OUT_DIR/${base}_opaque.c" "$file"
                ;;
        esac
    done < "$chunk"
done

Makefile

TIGRESS = tigress
SRC_LIST_FILE = .src_list.txt
SRC_FILES = $(wildcard *.c)
# Path relative to this directory; the script lives in the parent folder.
# The comment sits on its own line because a trailing comment would leave the
# spaces before '#' inside the variable's value.
TRANSFORM_SCRIPT = ../transform.sh

# These targets are commands, not files. `virtualize` and `flatten` have the
# same names as the output directories they create, so without .PHONY a second
# `make virtualize` would find the directory and report "up to date".
.PHONY: virtualize flatten opaque clean

# run_transform(mode, output_dir): write the source list (one file per line),
# run the transform script on it, then remove the list file.
define run_transform
	@echo $(SRC_FILES) | tr ' ' '\n' > $(SRC_LIST_FILE)
	@$(TRANSFORM_SCRIPT) $(1) $(SRC_LIST_FILE) $(2) $(TIGRESS)
	@rm -f $(SRC_LIST_FILE)
endef

virtualize:
	$(call run_transform,virtualize,virtualize)

flatten:
	$(call run_transform,flatten,flatten)

opaque:
	$(call run_transform,opaque,opaque_predicate)

clean:
	rm -f *_virtualize.c *_flatten.c *_opaque.c
	rm -rf virtualize flatten opaque_predicate
	rm -f chunk_* .src_list.txt

Makefile + .sh 스크립트를 통해 명령어가 길어지는 문제를 회피할 수 있으며, 커진 데이터셋 규모에서도

make [난독화 종류]

의 방식으로 실행할 수 있었다.

난독화 과정에서의 어려움

크기가 큰 파일의 경우 난독화가 오래 걸리는 문제로 이를 방지하기 위해 150KB 이상의 파일들은 제외하고 난독화하였다.

약 300KB 크기의 파일을 난독화 할 시 10 ~ 20 분 정도의 시간이 소요되는 것을 확인하였다.

제외한 이후에는 각 파일을 난독화 할 때의 시간이 5분 내외 정도 소요되었다.

원래는 Flatten, Opaque Predicate로 난독화 했어야하는데 이를 잘못하여 Virtualize와 Flatten 으로 난독화하여

현재 Opaque Predicate 난독화 코드 파일은 다른 코드에 비해 그 양이 적은 상태이다. (switch 코드만)

난독화 결과

Switch 코드 :

- AtCoder 800개

- computational_switch 10000개

- nested_switch 10000개 → 3862개 (150KB 파일 크기 제한)

위 코드를 Virtualize, Flatten으로 난독화 후 컴파일 하여 총 43986개를 (switch 코드 * 3) 생성하였다.

LLM switch문 탐지 여부 분석 - Virtualize + a

snejs — Fri, 9 May 2025 14:02:38 +0900

난독화 도구 : tigress

Environment : x86_64, Linux, Gcc

난독화 기법 : Virtualize + [ EncodeLiterals, OpaquePredicate, Split ]

사용한 코드 :

test_loops_arrays.c

test_recursion.c

test_struct.c

test_file_io.c

test_multithreading.c

LLM : Claude 3.7

질문 내용 : 이 코드는 switch 문을 사용했니? 예 아니오로 답하고, 근거를 들어줘.

분석 결과

Virtualize만 적용했던 난독화 코드를 기준으로 각각의 난독화를 중첩했을 때 Claude가 대답한 결과이다.

Claude 대답 분석

test_struct.c - Virtualize + EncodeLiterals

주요 사유 : Jump Table과 switch 문 간의 연결 실패

Jump Table을 사용하여 구현한 것은 분석해냈으나 그 구현이 switch문을 사용한 것에 대해 연결짓지 못해 아니오 라고 답한 것으로 추정됨.

(같은 난독화를 적용한) 다른 코드에 대해서 Jump Table을 동일하게 분석해내고 switch문과 연결지어 "예" 라고 답한 경우

두 사례를 바탕으로 Claude는 Jump Table과 switch 문 두 가지를 별개로 두고 'switch 문을 컴파일 시 일반적으로 Jump Table을 사용한다'는 추정을 하고 있는 것 같다.

test_recursion.c - Virtualize + Opaque Predicate

주요 사유 : if-else 분기 구조 사용

test_recursion.c - Virtualize + Split

주요 사유 : 연속된 if-else 분기 구조 사용하고 있으나 명시적인 switch 문(Jump Table 구조)을 찾을 수 없음

test_multithreading.c - Virtualize + Split

주요 사유 : 연속된 if-else 분기 구조 사용하고 있으나 명시적인 switch 문(Jump Table 구조)을 찾을 수 없음

특이사항 : 생성된 바이너리 코드의 길이 문제로 일부 잘라서 입력하였다. 이로인해 처음 Claude에게 코드를 제공할 때 일부 잘린 코드가 있었으며, 이를 다시 제공하니 if-else 분기 구조로 분석하던 코드를 Jump Table 구조를 사용한 코드로 재해석한 결과를 받았다.

생략되었던 코드 부분 : Jump Table 부분이 거의 생략되어 입력됨.

	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L21-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L20-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L19-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L18-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L17-.L14
	.long	.L16-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L39-.L14
	.long	.L15-.L14
	.long	.L13-.L14
	.text
.L12:
	subl	$3, %eax
	cmpl	$39, %eax
	ja	.L39
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	leaq	.L24(%rip), %rax
	movl	(%rdx,%rax), %eax
	cltq
	leaq	.L24(%rip), %rdx
	addq	%rdx, %rax
	notrack jmp	*%rax
	.section	.rodata
	.align 4
	.align 4
.L24:
	.long	.L32-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L31-.L24
	.long	.L30-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L29-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L28-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L27-.L24
	.long	.L26-.L24
	.long	.L25-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L39-.L24
	.long	.L23-.L24
	.text
.L11:
	cmpl	$92, %eax
	je	.L33
	jmp	.L39
.L19:
	movq	-640(%rbp), %rax
	addq	$1, %rax
	movq	%rax, -640(%rbp)
	movq	-640(%rbp), %rax
	movq	-632(%rbp), %rdx
	addq	$16, %rdx
	movl	(%rax), %eax
	movl	%eax, (%rdx)

결론

Claude는 if-else 구조를 switch 문과의 연관성이 낮다고 보는 것으로 보이며, if-else 구조인가 혹은 Jump Table 구조인가가 switch 문 분석 결과에 중심적인 사유가 되는 것으로 판단됨.

또한, Jump Table이 사용된 것으로 판단되는 코드임에도 해당 코드와 switch 문과의 연관성을 스스로 판단하는 과정이 있어 이에 따라 긍정/부정의 답을 내놓는 것으로 보임.

LLVM Pass 따라해보기

snejs — Fri, 9 May 2025 14:02:21 +0900

지난번 " Ubuntu에서 LLVM 설치 및 빌드하기 "에서 환경변수부터 다시 시작하도록 한다.

환경 변수 설정 및 추가 빌드

지난번 환경 변수 설정에서 설정값이 잘못된 것을 확인하고 이를 바꿔줬다.

~/.bashrc 파일을 열어 다음 명령어를 추가하고 저장해준다.

export LLVM_BASE_DIR=/root/LLVM/llvm-project
export LLVM_SRC=$LLVM_BASE_DIR/llvm
export LLVM_OBJ=$LLVM_BASE_DIR/build
export LLVM_DIR=$LLVM_BASE_DIR/build   # CMake용
export PATH=$LLVM_OBJ/bin:$PATH

.bashrc 파일의 변경 사항을 바로 적용하기 위해 터미널에 다음 명령어를 입력한다.

source ~/.bashrc

정상 적용 확인

기존에 clang이 build 되지 않은 것을 확인

추가적으로 clang을 build 하도록 했다.

cmake -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release ../llvm

기존 cache 파일들 때문에 명령어가 실행 되질 않는다.

build 디렉토리 내부에 있는 cache 파일들을 제거하고 다시 실행 하면 된다.

rm -rf CMakeCache.txt CMakeFiles

혹은 build 폴더를 제거하고 다시 cmake를 실행하면 된다.

clang build 까지 마쳐주면 build/bin 에 clang 실행 파일들이 생성된 것을 확인 할 수 있다.

ninja clang

build가 잘 되었는지 확인

llvm-config --version
clang --version

Pass Tutorial

pass tutorial 과정은 다음 링크에서 설명한 과정을 그대로 따라하였다.

https://github.com/Masterwhiece/llvm8-tutorials-jemin

GitHub - Masterwhiece/llvm8-tutorials-jemin: llvm8-tutorials-jemin

llvm8-tutorials-jemin. Contribute to Masterwhiece/llvm8-tutorials-jemin development by creating an account on GitHub.

github.com

Practice 1

소스코드를 다운받고 설명 내용을 바탕으로 따라하던 도중 구버전의 명령어에 대한 warning이 발생했다.

-NamePrinter 문법은 사용하지 말고,

-passes=name-printer + -load-pass-plugin 방식으로 변경하여 사용할 필요가 있었다.

이를 위해 몇가지 다음과 같은 수정이 필요하다.

Makefile 수정

# 기존 컴파일 명령어
clang++ -c -fpic -fno-rtti `llvm-config --cppflags` NamePrinter.cpp -o NamePrinter.o

# 수정된 컴파일 명령어
clang++ -fPIC -shared -o NamePrinter.so NamePrinter.cpp `llvm-config --cxxflags --ldflags --system-libs --libs all`

# 수정된 Makefile.common

# Common build rules. SOURCES, LIBRARY and EXECUTABLE are not defined here and
# must be set before these rules are read.

# Objects that go into a shared library must be position-independent.
ifneq ($(LIBRARY),)
EXTRA_FLAGS=-fPIC
endif

OBJECTS=$(SOURCES:.cpp=.o)

.PHONY: all

all: $(LIBRARY) $(EXECUTABLE)

# Pass plugin (.so). Do NOT link the LLVM libraries into it: the plugin is
# loaded into opt, which already contains LLVM. A second copy of the libraries
# registers every cl::opt again and opt aborts with
# "Option '...' registered more than once". Undefined LLVM symbols are
# resolved against the host tool when the plugin is loaded.
$(LIBRARY): $(OBJECTS)
	clang++ -shared -o $@ $^ `llvm-config --ldflags`

# Stand-alone tool: it has no host process, so it links LLVM itself.
$(EXECUTABLE): $(OBJECTS)
	clang++ $^ -o $@ `llvm-config --ldflags --libs --system-libs`

# Use --cxxflags (not only --cppflags) so the objects are compiled with the
# language standard and RTTI/exception settings reported for this LLVM build.
%.o: %.cpp
	clang++ -c $(EXTRA_FLAGS) `llvm-config --cxxflags` $< -o $@

.PHONY: clean

clean:
	rm -f $(LIBRARY) $(EXECUTABLE) $(OBJECTS)

명령어 수정

# 수정된 코드
make clean && make
clang -c -emit-llvm test.c -o test.bc
opt -load-pass-plugin ./NamePrinter.so -passes=name-printer test.bc -o test.opt.bc

NamePrinter.cpp 수정

몇몇 코드의 수정을 반복하여도 여전히 구 버전의 코드를 사용한다는 사유로 제대로 실행되지 않아 여러번 코드를 수정하는 것을 반복하였다.

CMake 로 시도

동일한 사유로 실패

해석 : LLVM Pass를 동적 로딩(-load-pass-plugin)할 때 동일한 옵션이 중복 등록돼 충돌할 경우 발생

llvm-project 재설치 고려

Ubuntu에서 LLVM 설치 및 빌드하기

snejs — Fri, 2 May 2025 14:01:28 +0900

LLVM Install

LLVM 다운로드는 다음 링크에서 하거나 "llvm download" 라고 검색하면 동일한 페이지로 진입할 수 있다.

https://releases.llvm.org/download.html

LLVM Download Page

If you'd like access to the "latest and greatest" in LLVM development, please see the instructions for accessing the LLVM Git Repository. The major changes and improvements that the development version contains relative to the previous release are listed i

releases.llvm.org

링크로 진입하면 여러 LLVM 버전들이 게시되어 있다.

다운로드 할 LLVM 버전을 고르고, page를 클릭하면 git에서 LLVM 을 다운로드 할 수 있다.

가장 최신 버전인 18.1.8 버전을 다운로드했다.

일반적으로 LLVM 과 Clang 정도만 다운로드 하면 되지만,

이전에 여러 번 실패하여 그냥 llvm-project-18.1.8.src.tar.xz 를 다운로드하여 사용하도록 했다.

Build까지 모두 성공한 시점에서 돌아보면, 사용하지 않는 파일들까지 설치하고 싶지 않다면

두 개 파일만 다운로드 후 설치해도 좋을 것 같다.

다운로드한 파일은 ubuntu로 옮겨서 압축 해제해주고 디렉토리 이름을 llvm-project 로 변경해주었다.

# 압축 해제 명령어
tar -xvf <파일이름>.tar.xz

# 디렉토리 이름 변경
mv <변경대상이름> llvm-project

LLVM Build

LLVM 빌드 과정은 다음 링크에서 제공하는 설명을 바탕으로 상황에 맞춰서 수정해가며 진행하였다.

https://www.llvm.org/docs/GettingStarted.html#getting-started-with-llvm

Getting Started with the LLVM System — LLVM 21.0.0git documentation

lli is the LLVM interpreter, which can directly execute LLVM bitcode (although very slowly…). For architectures that support it (currently x86, Sparc, and PowerPC), by default, lli will function as a Just-In-Time compiler (if the functionality was compil

www.llvm.org

먼저 빌드 과정에 대해 설명하기 전에, build directory를 직접 생성하고 build 디렉토리에서 작업하는 방법이 있으나,

본인은 이 방법으로 진행하다 수십 차례 실패하였고, 무엇이 옳고 그른지도 모른 채 자료만 찾아보다가, 최신 버전에서는 더 간단한 명령어로 빌드할 수 있다는 것을 알아내어 그 방법으로 빌드하였다.

이제 build를 하기위해서는 다음 명령어를 사용하면 된다.

-DCMAKE_BUILD_TYPE은 Debug로 설정해도 되지만 본인은 Debug로 설정했다가 컴퓨터 사양이 맞춰지지 못하여 Release로 바꿔서 다시 빌드했다.

명령어 실행 위치는 llvm-project에서 하면 된다.

cmake -S llvm -B build -G Ninja -DCMAKE_BUILD_TYPE=Release

참고로 Ubuntu에서 할당된 Memory는 4GB, 컴퓨터 CPU core 4, Memory 8GB로 그닥 무거운 프로그램을 돌리기에 좋진 않다.

cmake 실행 시에 다음과 같은 오류가 뜰 수 있는데 apt 버전이 안전하고 좋다고 하니 apt 명령어를 사용하는 버전으로 하여 cmake 설치를 하고 다시 진행하도록 했다.

(다음 그림은 기존 build 과정에서 찍은 것으로 명령어의 차이가 있으나, cmake가 설치되어 있지 않다면 다음과 같은 오류가 뜬다.)

sudo apt install cmake

또 위의 build 명령어에서 Ninja를 사용하여 build를 하고 있기 때문에 Ninja 설치도 해줬다.

해서 build를 원활히 끝내면 다음과 같은 화면을 만날 수 있다.

Build Test

이번에 진행된 모든 과정 중에 가장 오래걸리는 과정으로 만약 실행 환경의 사양이 좋지 못하다면 본인과 같은 어려움을 몇 차례 겪어야 할 지도 모른다.

기본 명령어는 다음과 같다.

ninja -C build check-llvm

그러면 다음과 같은 화면과 함께 test가 진행된다.

위 과정에서 test 중에 Memory가 부족해서 fatal error가 나면서 강제 종료 당했다.

강제 종료 당했을 시에는 해결 방법이 몇가지 있는데, 사양이 좋지 않다면 모두 적용하고 실행하는 것을 추천한다.

1. Debug -> Release : 앞서 말한 option에 대해 release 옵션으로 바꿔서 build 후 build test를 하면 훨씬 수월하게 test를 할 수 있다.

2. swap 늘리기

3. core 제한 옵션 달아서 실행하기

위의 방법 중에는 1번 3번 2번 순으로 효과적이라고 생각된다.

2번부터 자세히 설명하겠다.

먼저 memory와 swap 상태를 확인하기 위해 다음 명령어를 실행한다.

free -h

메모리는 3.7GB, Swap은 1GB로 메모리도 swap도 턱없이 부족한 상태.

급한대로 swap이라도 늘리고자 하여 다음 명령어를 사용하여 4GB로 늘렸다.

sudo swapoff -a
sudo dd if=/dev/zero of=/swapfile bs=1G count=4
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile

잘 늘어난 것을 확인할 수 있다.

3번은 옵션 다는 것이라 기존 명령어에서 -j 옵션을 추가해주면 된다.

각자 사양에 맞춰서 하면 되는 것 같고, 본인은 -j2로 돌리다가 안 돼서 -j1으로 test 하였다.

# 기존 명령어
ninja -C build check-llvm
# 2개 프로세스 병렬 실행 제한
ninja -C build -j2 check-llvm
# 3개 프로세스 병렬 실행 제한
ninja -C build -j3 check-llvm

다행히 build test는 중간에 끊기면 끊겼던 부분에서 다시 시작하는 듯 하다.

모든 test가 완료되면 다음과 같은 화면을 만날 수 있다.

환경 변수 설정하기

llvm-project : base directory

llvm : llvm 소스

clang : clang 소스

build : cmake ../llvm (빌드) 실행 directory, 실행 가능한 도구들이 생성된 directory

# setting for environment variables

export LLVM_BASE_DIR=/root/LLVM/llvm-project

export LLVM_SRC=$LLVM_BASE_DIR/llvm
export LLVM_SRC_ROOT=$LLVM_BASE_DIR/llvm
export LLVM_ROOT=$LLVM_BASE_DIR/llvm
export LLVM_OBJ=$LLVM_BASE_DIR/build
export LLVM_OBJ_DIR=$LLVM_BASE_DIR/build
export LLVM_OBJ_ROOT=$LLVM_BASE_DIR/build
export LLVM_INSTALL_DIR=$LLVM_BASE_DIR/llvm-install
export LLVM_DIR=$LLVM_BASE_DIR/build
export PATH=$LLVM_INSTALL_DIR/bin:$PATH

난독화 list 조사

snejs — Fri, 2 May 2025 14:01:09 +0900

loop-switch 구조에 효과적일 수 있는 난독화에 대해 조사하고

난독화 사용에 있어 효과적인 난독화 조합에 대해서도 조사하였다.

조사에는 GPT의 도움을 받았다.

난독화 기법

바이너리 코드에 대한 분석을 하는 것을 감안,

단독으로 사용될 수 있는 난독화보다는 조합적으로 사용할 수 있는 난독화에 대해 조사하였다.

1. EncodeLiterals

- 핵심 효과 : 흐름 제어에 쓰이는 키값들을 정적으로는 해석할 수 없도록 인코딩

- switch case의 분기 키나 VM opcode 식별값이 숨겨지면 flow recovery가 어려움

2. DeadCode

- 핵심 효과 : 실행되지 않는 (죽은) 코드를 프로그램에 삽입해서 혼란스럽게 만듦

- switch 흐름을 직접적으로 교란하진 않지만, flatten된 구조에 간접적으로 사용 시 혼란을 유발할 수 있음

- 단일로는 효율이 떨어짐

3. Split

- 핵심 효과 : 하나의 함수를 여러 개의 작은 함수로 분할하여 분석 난이도를 증가시킴

- dispatcher 구조처럼 tight loop 안에서 switch가 동작하는 경우에는 split이 적용 안 될 수 있음

난독화 조합

루프 기반 switch 흐름에서 효과적인 난독화

1. state 값 추적 차단 : Opaque Predicate, EncodeLiterals

2. switch 안의 분기 흐름 교란 : DeadCode

3. 분기 순서를 예측하기 어렵게 만듦 : Virtualize, Split