mirror of https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Bug fixes (#249)
* Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update cross_entropy_loss.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * rope * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * llama * Update llama.py * gemma * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update save.py * RoPE * Update llama.py * Update llama.py * Update llama.py * Update gemma.py * correct_dtype * Update gemma.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Chat Templates * Update README.md * Update README.md * Update llama.py * DoRA * Update _utils.py * Update chat_templates.py * Update llama.py * Hotfix - fix DoRA, Gemma prompt template (#202) (#203) * Update save.py * saving * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update save.py * Update __init__.py * Update save.py * Update save.py * Update save.py * save * trainer * spaces * original * Gemma * Update pyproject.toml * Update mapper.py * Update fast_lora.py * FastGemmaModel * model_type * Update llama.py * Update llama.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update llama.py * Update fast_lora.py * Update llama.py * Update llama.py * Update cross_entropy_loss.py * Update llama.py * Update llama.py * gemma * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update fast_lora.py * Update fast_lora.py * Fast CE 
Loss * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * CE * Update llama.py * Update llama.py * Update cross_entropy_loss.py * Update geglu.py * Update cross_entropy_loss.py * revert * Update llama.py * Update llama.py * norm * Update gemma.py * Update gemma.py * position_ids * Update gemma.py * Update gemma.py * pos * Update llama.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update cross_entropy_loss.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * revert * revert * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update cross_entropy_loss.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * rope * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * llama * Update llama.py * gemma * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update 
gemma.py * Update gemma.py * Update save.py * RoPE * Update llama.py * Update llama.py * Update llama.py * Update gemma.py * correct_dtype * Update gemma.py * Update cross_entropy_loss.py * Update cross_entropy_loss.py * Chat Templates * Update README.md * Update README.md * Update llama.py * DoRA * Update _utils.py * Update chat_templates.py * Update pyproject.toml * Small fixes * Update pyproject.toml * Approx gelu * Update geglu.py * Approx gelu * Update llama.py * Update __init__.py * Update __init__.py * Update _utils.py * Update geglu.py * Update gemma.py * Update rms_layernorm.py * Update rms_layernorm.py * Update rms_layernorm.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Update gemma.py * Fix Gemma merging * Update rms_layernorm.py * Update gemma.py * Update pyproject.toml * Layernorms * Gemma precision * Update gemma.py * sqrt * Update gemma.py * Update save.py * RoPE and Gemma precision * Update rms_layernorm.py * Fix warning * Update chat_templates.py * Update chat_templates.py * Update save.py * Update save.py * Update save.py * Update chat_templates.py * Update llama.py * model_name * Update loader.py * Tokenizer overwritten * Update llama.py * Update llama.py * Update llama.py * Update save.py * Accuracy * Revert * Update save.py * Update fast_lora.py * Update fast_lora.py * Update fast_lora.py * Update fast_lora.py * Update fast_lora.py * Update chat_templates.py * Update save.py * Update save.py * Update llama.py * Update llama.py * Account for DoRA * Update llama.py * Update save.py * GGUF incorrect * Update save.py * Update pyproject.toml * kaggle new * Update pyproject.toml * Update pyproject.toml * upcasting * Fix Colab * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update chat_templates.py * Update chat_templates.py * Update chat_templates.py * Update chat_templates.py * Update chat_templates.py * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update rope_embedding.py * Update rope_embedding.py * Fix bugs * Update fast_lora.py * Update fast_lora.py * Update README.md * Update README.md * GGUF * Update save.py * Update save.py * Update save.py * Update save.py * Update README.md * Update README.md
This commit is contained in:
parent 39713e66ed
commit c599ae0f27
5 changed files with 116 additions and 37 deletions
24  README.md
@@ -91,13 +91,11 @@ Select either `pytorch-cuda=11.8` for CUDA 11.8 or `pytorch-cuda=12.1` for CUDA
conda create --name unsloth_env python=3.10
conda activate unsloth_env

conda install pytorch cudatoolkit torchvision torchaudio pytorch-cuda=<12.1/11.8> -c pytorch -c nvidia
conda install pytorch-cuda=<12.1/11.8> pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers

conda install xformers -c xformers
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

pip install bitsandbytes

pip install "unsloth[conda] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps trl peft accelerate bitsandbytes
```

### Pip Installation
@@ -144,6 +142,22 @@ pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/u
```bash
pip install --upgrade pip
```
6. For Pytorch 2.2.1:
```bash
# RTX 3090, 4090 Ampere GPUs:
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

# Pre Ampere RTX 2080, T4, GTX 1080 GPUs:
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps xformers trl peft accelerate bitsandbytes
```
7. To troubleshoot installs try the below (all must succeed). Xformers should mostly all be available.
```bash
nvcc
python -m xformers.info
python -m bitsandbytes
```

## 📜 Documentation
- Go to our [Wiki page](https://github.com/unslothai/unsloth/wiki) for saving to GGUF, checkpointing, evaluation and more!
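The two pip variants above differ only in whether `flash-attn` is installed, which needs an Ampere-class (compute capability 8.0+) GPU. A minimal sketch, not part of this commit, for checking which case applies once `torch` is installed:

```python
# Check the CUDA compute capability to pick between the Ampere and
# pre-Ampere install commands above (illustrative helper, not unsloth code).
import torch

if torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("Ampere or newer (RTX 30/40 series, A100): flash-attn line applies")
    else:
        print("Pre-Ampere (RTX 20 series, T4, GTX 10 series): xformers-only line applies")
else:
    print("No CUDA device visible")
```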
@@ -18,9 +18,9 @@ import importlib
# Currently only supports 1 GPU, or else seg faults will occur.
if "CUDA_VISIBLE_DEVICES" in os.environ:
devices = os.environ["CUDA_VISIBLE_DEVICES"]
# check if there are multiple cuda devices set in env
# Check if there are multiple cuda devices set in env
if not devices.isdigit():
first_id = devices.split(',')[0]
first_id = devices.split(",")[0]
warnings.warn(
f"Unsloth: 'CUDA_VISIBLE_DEVICES' is currently {devices} \n"\
"Multiple CUDA devices detected but we require a single device.\n"\
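A user-side sketch (illustrative, not part of the diff) of what satisfying this check looks like, since the environment variable must be set before CUDA is initialised:

```python
# Pin a single GPU before importing torch/unsloth so the single-device
# check above passes without warnings (illustrative only).
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # one device id, e.g. the first GPU

import torch
print(torch.cuda.device_count())           # expected output: 1
```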
@@ -33,20 +33,29 @@ else:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
pass

# Reduce VRAM usage by reducing fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

try:
import torch
except:
raise ImportError("Pytorch is not installed. Go to https://pytorch.org/.\n"\
"We have some installation instructions on our Github page.")

# We support torch 2.1 and 2.1.1
# We support Pytorch 2
# Fixes https://github.com/unslothai/unsloth/issues/38
torch_version = torch.__version__.split(".")
major_torch, minor_torch = torch_version[0], torch_version[1]
major_torch, minor_torch = int(major_torch), int(minor_torch)
if (major_torch != 2):# or (major_torch == 2 and minor_torch < 1):
raise ImportError("Unsloth only supports Pytorch 2.1 for now. Please update your Pytorch to 2.1.\n"\
if (major_torch < 2):
raise ImportError("Unsloth only supports Pytorch 2 for now. Please update your Pytorch to 2.1.\n"\
"We have some installation instructions on our Github page.")
elif (major_torch == 2) and (minor_torch < 2):
# Disable expandable_segments
del os.environ["PYTORCH_CUDA_ALLOC_CONF"]
# Must reimport Pytorch!
importlib.reload(torch)
pass

# Try loading bitsandbytes and triton
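The effect of the new gate is that `expandable_segments:True` is only kept when the installed PyTorch is 2.2 or newer; on older 2.x versions the variable is removed again and `torch` is reloaded. A standalone sketch of the same version check (variable names are local to this example):

```python
# Sketch of the version gate above: keep the allocator option only on PyTorch >= 2.2.
import os
import torch

major, minor = (int(part) for part in torch.__version__.split(".")[:2])
if major < 2:
    raise ImportError("PyTorch 2.x is required")
elif (major, minor) < (2, 2):
    # Older 2.x: drop the option rather than risk an unsupported allocator config.
    os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None)
else:
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
```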
@@ -17,6 +17,7 @@ import triton.language as tl
import torch
from .utils import calculate_settings

ROPE_GROUP_SIZE = 4

@triton.heuristics({"BACKWARD_PASS": lambda args: args["BACKWARD_PASS"],})
@triton.jit
@@ -24,9 +25,11 @@ def _rope_embedding(
Q, Q_row_stride,
cos, cos_row_stride,
sin, sin_row_stride,
seqlen, head_dim, group_size, n_heads,
BACKWARD_PASS: tl.constexpr,
BLOCK_SIZE : tl.constexpr,
seqlen,
head_dim : tl.constexpr,
n_heads : tl.constexpr,
BACKWARD_PASS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
):
"""
Calculates the RoPE Embedding quickly
@@ -49,16 +52,18 @@ def _rope_embedding(
sin1 = -sin1
pass

head_start = group_head_position * group_size
head_end = tl.math.min((head_start + group_size), n_heads)
# [TODO] Autotune ROPE_GROUP_SIZE to be 1, 2, 4, 8
head_start = group_head_position * ROPE_GROUP_SIZE
head_end = min((head_start + ROPE_GROUP_SIZE), n_heads)

for i in range(head_start, head_end):
offs_q1 = row_position * Q_row_stride + i * head_dim + col_offsets
offs_q2 = row_position * Q_row_stride + i * head_dim + col_offsets + half_head_dim
# 10% Faster kernel from [HuyNguyen-hust](https://github.com/unslothai/unsloth/pull/238)
for k in range(head_start, head_end):
offs_q1 = row_position * Q_row_stride + k * head_dim + col_offsets
offs_q2 = row_position * Q_row_stride + k * head_dim + col_offsets + half_head_dim

# For Gemma - sometimes RoPE must be done in float32 and not bfloat16
Q1 = tl.load(Q + offs_q1, mask = mask, other = 0).to(sin1.dtype)
Q2 = tl.load(Q + offs_q2, mask = mask, other = 0).to(sin1.dtype)
Q1 = tl.load(Q + offs_q1, mask = mask, other = 0).to(sin1.dtype)
Q2 = tl.load(Q + offs_q2, mask = mask, other = 0).to(sin1.dtype)

tl.store(Q + offs_q1, Q1*cos1 - Q2*sin1, mask = mask)
tl.store(Q + offs_q2, Q2*cos1 + Q1*sin1, mask = mask)
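The two `tl.store` lines implement the usual rotary update on the two halves of each head, `q1' = q1*cos - q2*sin` and `q2' = q2*cos + q1*sin`. A plain PyTorch reference of the same transform (shapes and names are illustrative, not the kernel's interface):

```python
# Reference for the rotation the Triton kernel applies in place on Q.
import torch

def rope_reference(q, cos, sin):
    # q: (seq_len, n_heads, head_dim); cos/sin: (seq_len, head_dim // 2)
    half = q.shape[-1] // 2
    q1, q2 = q[..., :half], q[..., half:]
    cos, sin = cos[:, None, :], sin[:, None, :]   # broadcast over heads
    return torch.cat([q1 * cos - q2 * sin, q2 * cos + q1 * sin], dim = -1)
```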
@@ -78,21 +83,24 @@ class Fast_RoPE_Embedding(torch.autograd.Function):
# [TODO] Changing blocksize to head_dim//2 seems to have
# some concurrency / un-deterministic issues.
BLOCK_SIZE, num_warps = calculate_settings(head_dim//2) # (head_dim//2)
group_size = 4 # 4 or 8, too large group_size can hurt performance.
n_groups = triton.cdiv(n_heads, group_size)

# group_size = 4 # 4 or 8, too large group_size can hurt performance.
div, mod = divmod(n_heads, ROPE_GROUP_SIZE)
n_groups = div + (mod != 0)

grid = (n_rows, n_groups, )
_rope_embedding[grid](
_rope_embedding[(n_rows, n_groups, )](
Q, Q.stride(0),
cos, cos.stride(0),
sin, sin.stride(0),
seq_len, head_dim, group_size, n_heads,
seq_len,
head_dim, n_heads,
BACKWARD_PASS = False,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = num_warps,
)
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.n_groups = n_groups
ctx.cos = cos
ctx.sin = sin
return Q.view(batch, seq_len, n_heads, head_dim)
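`div + (mod != 0)` is a ceiling division, so the launch grid still covers every head when `n_heads` is not a multiple of `ROPE_GROUP_SIZE`; it matches the `triton.cdiv` call it replaces. A quick sketch:

```python
# Ceiling division used for the kernel grid (equivalent to triton.cdiv(n_heads, ROPE_GROUP_SIZE)).
ROPE_GROUP_SIZE = 4

def n_rope_groups(n_heads):
    div, mod = divmod(n_heads, ROPE_GROUP_SIZE)
    return div + (mod != 0)

assert n_rope_groups(32) == 8   # exact multiple of the group size
assert n_rope_groups(14) == 4   # three full groups plus one partial group
```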
@@ -108,15 +116,11 @@ class Fast_RoPE_Embedding(torch.autograd.Function):
cos = ctx.cos
sin = ctx.sin

group_size = 4 # 4 or 8, too large group_size can hurt performance.
n_groups = triton.cdiv(n_heads, group_size)

grid = (n_rows, n_groups, )
_rope_embedding[grid](
_rope_embedding[(n_rows, ctx.n_groups, )](
dY, dY .stride(0),
cos, cos.stride(0),
sin, sin.stride(0),
seq_len, head_dim, group_size, n_heads,
seq_len, head_dim, n_heads,
BACKWARD_PASS = True,
BLOCK_SIZE = ctx.BLOCK_SIZE,
num_warps = ctx.num_warps,
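The backward pass reuses the cached `ctx.n_groups` and launches the same kernel with `BACKWARD_PASS = True`, which negates `sin` (the `sin1 = -sin1` branch above); rotating by the negative angle is the inverse of the forward rotation. A small PyTorch sketch of that gradient (names are illustrative):

```python
# Gradient of the rotation: the forward formula with sin replaced by -sin.
import torch

def rope_backward_reference(dy, cos, sin):
    # dy: (seq_len, n_heads, head_dim); cos/sin: (seq_len, head_dim // 2)
    half = dy.shape[-1] // 2
    d1, d2 = dy[..., :half], dy[..., half:]
    cos, sin = cos[:, None, :], sin[:, None, :]
    return torch.cat([d1 * cos + d2 * sin, d2 * cos - d1 * sin], dim = -1)
```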
@@ -18,6 +18,8 @@ import warnings
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch")
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "huggingface_hub")
warnings.filterwarnings(action = "ignore", category = RuntimeWarning, module = "subprocess")
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "transformers")
warnings.filterwarnings(action = "ignore", category = FutureWarning, module = "accelerate")
import bitsandbytes as bnb
from transformers.models.llama.modeling_llama import logger
from transformers import AutoTokenizer
@@ -593,16 +593,17 @@ def install_llama_cpp_old(version = -10):
pass

# Clone a specific commit
# Also don't use the GPU!
commands = [
"git clone https://github.com/ggerganov/llama.cpp",
f"cd llama.cpp && git reset --hard {version} && git clean -df && "\
f"make clean && LLAMA_CUBLAS=1 make all -j{psutil.cpu_count()*2}",
f"make clean make all -j{psutil.cpu_count()*2}",
"pip install gguf protobuf",
]
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stdout:
print(line.decode("utf-8"), flush = True, end = "")
print(line.decode("utf-8", errors = "replace"), flush = True, end = "")
pass
pass
# Check if successful
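The functional change in this loop is `errors = "replace"`: llama.cpp's build output is not guaranteed to be valid UTF-8, and a strict decode would abort the whole install on a single bad byte. A small illustration (the log line is made up):

```python
# Why errors="replace" matters when streaming build logs byte-by-byte.
raw = b"compiling ggml.c \xfe done\n"          # hypothetical line with an invalid UTF-8 byte

print(raw.decode("utf-8", errors = "replace"), end = "")   # prints, with U+FFFD for \xfe
try:
    raw.decode("utf-8")
except UnicodeDecodeError as exc:
    print(f"strict decode fails: {exc}")
```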
@@ -625,12 +626,55 @@ def install_llama_cpp_blocking():
for command in commands:
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stdout:
print(line.decode("utf-8"), flush = True, end = "")
print(line.decode("utf-8", errors = "replace"), flush = True, end = "")
pass
pass
pass

def _fix_gemma_gguf():
# Fixes Gemma saving to GGUF to float32 instead of float16!
with open("llama.cpp/convert-hf-to-gguf.py", "rb") as file:
text = file.read()
pass

gemma_start = text.find(b"class GemmaModel(Model):")
if gemma_start == -1: return

gemma_end = text.find(b"self.gguf_writer.add_tensor(new_name, data)", gemma_start)
if gemma_end == -1: return

gemma_text = text[gemma_start : gemma_end]
bad_text = \
b""" data = data.astype(np.float32)

# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)"""
good_text = \
b""" # if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)

# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)

# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)"""
find_bad = gemma_text.find(bad_text)
if find_bad == -1: return

gemma_text = gemma_text[:find_bad] + good_text + gemma_text[find_bad + len(bad_text):]
text = text[:gemma_start] + gemma_text + text[gemma_end:]

with open("llama.cpp/convert-hf-to-gguf.py", "w+b") as file:
file.write(text)
pass
pass


def save_to_gguf(
model_type : str,
model_directory : str = "unsloth_finetuned_model",
@@ -686,7 +730,10 @@ def save_to_gguf(
install_llama_cpp_blocking()
pass
# Check if successful. If not install 10th latest release
if error != 0 or not os.path.exists("llama.cpp/quantize"): install_llama_cpp_old(-10)
if error != 0 or not os.path.exists("llama.cpp/quantize"):
print(f"Unsloth: llama.cpp error code = {error}.")
install_llama_cpp_old(-10)
pass

if quantization_method == "f32": first_conversion = "f32"
elif quantization_method == "f16": first_conversion = "f16"
@@ -723,6 +770,9 @@ def save_to_gguf(
f"--outfile {final_location} --vocab-type hfft "\
f"--outtype {first_conversion} --concurrency {n_cpus}"
else:
# Need to fix convert-hf-to-gguf.py for some models!
_fix_gemma_gguf()

command = f"python llama.cpp/convert-hf-to-gguf.py {model_directory} "\
f"--outfile {final_location} "\
f"--outtype {first_conversion}"
@@ -730,7 +780,7 @@ def save_to_gguf(

with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stdout:
print(line.decode("utf-8"), flush = True, end = "")
print(line.decode("utf-8", errors = "replace"), flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass
@@ -760,7 +810,7 @@ def save_to_gguf(
# quantize uses stderr
with subprocess.Popen(command, shell = True, stderr = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stderr:
print(line.decode("utf-8"), flush = True, end = "")
print(line.decode("utf-8", errors = "replace"), flush = True, end = "")
if sp.returncode is not None and sp.returncode != 0:
raise subprocess.CalledProcessError(sp.returncode, sp.args)
pass