122 changes: 122 additions & 0 deletions build_windows_12700k.bat
@@ -0,0 +1,122 @@
@echo off
REM ============================================================
REM Build bitnet.cpp for Windows x86_64 (Intel 12700K / AVX2)
REM ============================================================
REM
REM Prerequisites:
REM 1. Visual Studio 2022 with "Desktop development with C++" workload
REM 2. LLVM/Clang >= 18: https://github.com/llvm/llvm-project/releases
REM 3. CMake >= 3.22: https://cmake.org/download/
REM 4. Ninja: https://github.com/ninja-build/ninja/releases
REM 5. Python >= 3.9 with conda
REM
REM After installing VS 2022, run this from a "Developer Command Prompt for VS 2022"
REM or "x64 Native Tools Command Prompt for VS 2022"
REM
REM Usage:
REM build_windows_12700k.bat
REM ============================================================

echo.
echo === BitNet TL2 Build for Intel 12700K (AVX2) ===
echo.

REM Check prerequisites
where clang >nul 2>&1
if errorlevel 1 (
echo ERROR: clang not found. Install LLVM/Clang ^>= 18 and add to PATH.
echo Download: https://github.com/llvm/llvm-project/releases
exit /b 1
)

where cmake >nul 2>&1
if errorlevel 1 (
echo ERROR: cmake not found. Install CMake ^>= 3.22.
exit /b 1
)

where ninja >nul 2>&1
if errorlevel 1 (
echo WARNING: ninja not found, will use default generator.
echo For faster builds, install Ninja: https://github.com/ninja-build/ninja/releases
set GENERATOR=-G "Visual Studio 17 2022" -T ClangCL
) else (
set GENERATOR=-G Ninja
)

REM Initialize submodule if needed
if not exist "3rdparty\llama.cpp\CMakeLists.txt" (
echo Initializing submodule...
git submodule update --init
)

REM Apply the ggml NaN guard patch to the submodule
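REM (Idempotent: if the patch is already applied, git apply fails, its error
REM is discarded by the 2>nul below, and the submodule is left unchanged.)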
if exist "nan-guards-ggml.patch" (
echo Applying NaN guard patch to llama.cpp submodule...
pushd 3rdparty\llama.cpp
git apply ..\..\nan-guards-ggml.patch 2>nul
popd
)

REM Step 1: Generate TL2 kernels (uses Python)
echo.
echo === Step 1: Generating TL2 kernels ===
python utils\codegen_tl2.py --model bitnet_b1_58-2B --BM 256 --BK 96 --bm 32
if errorlevel 1 (
echo WARNING: Kernel generation failed. Using existing kernels if available.
)

REM Step 2: CMake configure
echo.
echo === Step 2: CMake Configure ===
if not exist "build_win" mkdir build_win
cd build_win

cmake .. %GENERATOR% ^
-DCMAKE_C_COMPILER=clang ^
-DCMAKE_CXX_COMPILER=clang++ ^
-DGGML_BITNET_X86_TL2=ON ^
-DCMAKE_BUILD_TYPE=Release

if errorlevel 1 (
echo ERROR: CMake configure failed.
cd ..
exit /b 1
)

REM Step 3: Build
echo.
echo === Step 3: Building ===
cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%

if errorlevel 1 (
echo ERROR: Build failed.
cd ..
exit /b 1
)

cd ..

echo.
echo ============================================================
echo BUILD COMPLETE
echo ============================================================
echo.
echo Binaries are in: build_win\bin\Release\ (or build_win\bin\)
echo.
echo To run inference:
echo build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^^
echo -p "What is an SBOM?" -n 200 --temp 0.7 -t 8
echo.
echo Interactive mode:
echo build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^^
echo -i -p "<|im_start|>assistant" ^^
echo --in-prefix "<|im_start|>user\n" ^^
echo --in-suffix "<|im_end|>\n<|im_start|>assistant\n" ^^
echo -n 512 --temp 0.7 --repeat-penalty 1.2 -t 8 ^^
echo -r "<|im_end|>"
echo.
echo Server mode (access from any device):
echo build_win\bin\llama-server.exe -m PATH\TO\bitnet_v2_TL2.gguf ^^
echo --host 0.0.0.0 --port 8080 -t 8 -c 4096
echo.
63 changes: 63 additions & 0 deletions nan-guards-ggml.patch
@@ -0,0 +1,63 @@
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 127c6bcd..e0f1350d 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3508,7 +3508,7 @@ void quantize_row_i8_s(const float * x, void * y, int64_t n, float* act_scales,
for (int i = 0; i < n; ++i) {
max = MAX(max, (double)fabs((double)x[i]));
}
- float s = 127 / max;
+ float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f;
act_scales[0] = s;
int32_t sum = 0;
for (int i = 0; i < n; ++i) {
@@ -3530,7 +3530,7 @@ void quantize_row_i8_s_4x1(const float * x, void * y, int64_t n, float* act_scal
for (int i = 0; i < n; ++i) {
max = MAX(max, (double)fabs((double)x[i]));
}
- float s = 127 / max;
+ float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f;
act_scales[0] = s;
int32_t sum = 0;
for (int i = 0; i < n / ACT_K_PACK_SIZE; ++i) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 121f72da..5a5e68fc 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -12510,7 +12510,8 @@ static void ggml_compute_forward_mul_mat_one_chunk(

// post compute activation scaling
for (int row = 0; row < 16; row++) {
- tmp[row] = (tmp[row] - act_sums[i1]) / (act_scales[i1]) * (*scale);
+ float as = act_scales[i1];
+ tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[i1]) / as * (*scale) : 0.0f;
}
}
else
@@ -12518,7 +12519,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
if (src0->type == GGML_TYPE_I2_S) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0 * nb01 / 4, 0, src1_col_de, 0, 1);
- tmp[ir0 - iir0] = (tmp[ir0 - iir0] - act_sums[i1]) / (act_scales[i1]) * (*scale);
+ { float as = act_scales[i1]; tmp[ir0 - iir0] = (as != 0.0f) ? (tmp[ir0 - iir0] - act_sums[i1]) / as * (*scale) : 0.0f; }
} else {
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
}
@@ -13266,7 +13267,7 @@ UseGgmlGemm2:;
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
for (int col = 0; col < ne11 - ne11 % 4; col++) {
for (int row = 0; row < src0_end - src0_start; row++) {
- tmp[col * (src0_end - src0_start) + row] = (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / (act_scales[col]) * (*scale);
+ { float as = act_scales[col]; tmp[col * (src0_end - src0_start) + row] = (as != 0.0f) ? (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / as * (*scale) : 0.0f; }
}
memcpy((float *)((char *) dst->data + (col * nb1)) + src0_start, tmp + col * (src0_end - src0_start), (src0_end - src0_start) * sizeof(float));
}
@@ -13287,7 +13288,7 @@ UseGgmlGemm2:;
(const char *) src1_wdata + (src1_col_stride * iter),
1, src0_end - src0_start);
for (int row = 0; row < src0_end - src0_start; row++) {
- tmp[row] = (tmp[row] - act_sums[iter]) / (act_scales[iter]) * (*scale);
+ { float as = act_scales[iter]; tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[iter]) / as * (*scale) : 0.0f; }
}
memcpy((float *)((char *) dst->data + (iter * nb1)) + src0_start, tmp, (src0_end - src0_start) * sizeof(float));
}
62 changes: 32 additions & 30 deletions utils/codegen_tl2.py
@@ -89,7 +89,9 @@ def gen_ctor_code():
__m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec));\n\
max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1));\n\
max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1));\n\
-float scales = 127 / _mm_cvtss_f32(max1);\n\
+float max_val = _mm_cvtss_f32(max1);\n\
+if (max_val < 1e-10f) max_val = 1e-10f;\n\
+float scales = 127.0f / max_val;\n\
*lut_scales = scales;\n\
#endif\n\
return 0;\n\
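This first hunk guards the generated AVX kernel's LUT scale differently: rather than zeroing the scale, it clamps the horizontal max before dividing, so an all-zero block yields a large but finite scale. A standalone sketch of that reduction plus clamp (compile with -mavx; uses only the intrinsics shown in the diff):

```c
#include <immintrin.h>
#include <stdio.h>

/* Horizontal max over 8 floats, then the clamped 127/max scale, mirroring
   the generated kernel's epilogue. */
static float lut_scale(__m256 max_vec) {
    __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1),
                             _mm256_castps256_ps128(max_vec));
    max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1));
    max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1));
    float max_val = _mm_cvtss_f32(max1);
    if (max_val < 1e-10f) max_val = 1e-10f;  /* avoid 127/0 -> inf */
    return 127.0f / max_val;
}

int main(void) {
    printf("%g\n", lut_scale(_mm256_setzero_ps()));  /* 1.27e+12, finite */
    return 0;
}
```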
@@ -490,7 +492,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
}}\n\
\n\
template<int BATCH_SIZE>\n\
-int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\
alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
#pragma unroll\n\
@@ -501,14 +503,14 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\
#pragma unroll\n\
for (int i = 0; i < BM{0}; i++) {{\n\
-((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
+((int32_t*)C)[bs * out_stride + i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
}}\n\
}}\n\
return 0;\n\
}}\n\
\n\
template<int BATCH_SIZE>\n\
-int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\
alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
#pragma unroll\n\
@@ -519,8 +521,8 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\
#pragma unroll\n\
for (int i = 0; i < BM{0}; i++) {{\n\
-((int32_t*)C)[i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
-((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0];\n\
+((int32_t*)C)[bs * out_stride + i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
+{{ float ls = ((float*)LUT_Scales)[bs]; ((float*)C)[bs * out_stride + i] = (ls != 0.0f) ? (float)(((int32_t*)C)[bs * out_stride + i]) / ls * ((float*)Scales)[0] : 0.0f; }}\n\
}}\n\
}}\n\
return 0;\n\
@@ -556,32 +558,32 @@ def gen_top_api(kernel_shapes, k_list):
if (m == {0} && k == {1}) {{\n\
if (BK == {2}) {{\n\
if (bs == 1) {{\n\
-two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 8) {{\n\
-two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 32) {{\n\
-two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 128) {{\n\
-two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 256) {{\n\
-two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 512) {{\n\
-two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\
}}\n\
}}\n\
else if (BK == {3}) {{\n\
if (bs == 1) {{\n\
-three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 8) {{\n\
-three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 32) {{\n\
-three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 128) {{\n\
-three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 256) {{\n\
-three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 512) {{\n\
-three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}\n\
}}\n\
}}\n\
@@ -590,32 +592,32 @@ def gen_top_api(kernel_shapes, k_list):
kernel_code = "".join([kernel_code, " else if (m == {0} && k == {1}) {{\n\
if (BK == {2}) {{\n\
if (bs == 1) {{\n\
-two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 8) {{\n\
-two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 32) {{\n\
-two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 128) {{\n\
-two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 256) {{\n\
-two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\
}} else if (bs == 512) {{\n\
-two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\
+two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\
}}\n\
}}\n\
else if (BK == {3}) {{\n\
if (bs == 1) {{\n\
-three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 8) {{\n\
-three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 32) {{\n\
-three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 128) {{\n\
-three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 256) {{\n\
-three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}else if (bs == 512) {{\n\
-three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
}}\n\
}}\n\
}}\n\
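The second change threads an `out_stride` argument (passed as `m` at every call site) through `two_qgemm_lut_*` and `three_qgemm_lut_*`. Without it, every batch iteration wrote its results to `C[i]`, so for `bs > 1` each batch overwrote row 0 of the output. A minimal sketch of the indexing fix, with hypothetical sizes standing in for the generated `BM{0}` constants:

```c
#include <stdio.h>

#define BATCH 2   /* bs: number of output rows          */
#define BM    4   /* outputs produced per batch row     */

/* Copy per-batch accumulator rows out of CBits. The old code indexed
   C[i], so batch 1 clobbered batch 0; the fix offsets by bs * out_stride. */
static void write_outputs(int *C, const int *CBits, int out_stride) {
    for (int bs = 0; bs < BATCH; bs++)
        for (int i = 0; i < BM; i++)
            C[bs * out_stride + i] = CBits[i + bs * BM];  /* was: C[i] = ... */
}

int main(void) {
    int CBits[BATCH * BM] = {1, 2, 3, 4, 5, 6, 7, 8};
    int C[BATCH * BM] = {0};
    write_outputs(C, CBits, BM);            /* out_stride == m == BM here */
    for (int i = 0; i < BATCH * BM; i++)
        printf("%d ", C[i]);                /* prints 1..8, both rows kept */
    printf("\n");
    return 0;
}
```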