Merge pull request #110 from iryna-kondr/gguf
Replaced gpt4all with llama-cpp-python as the backend for local GGUF models.
iryna-kondr committed Aug 4, 2024
2 parents a9c29c4 + 10c09d2 commit 261f501
Showing 10 changed files with 242 additions and 70 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
"google-cloud-aiplatform[pipelines]>=1.27.0,<2.0.0"
]
name = "scikit-llm"
version = "1.3.1"
version = "1.4.0"
authors = [
{ name="Oleh Kostromin", email="kostromin97@gmail.com" },
{ name="Iryna Kondrashchenko", email="iryna230520@gmail.com" },
@@ -27,7 +27,7 @@ classifiers = [
]

[project.optional-dependencies]
gpt4all = ["gpt4all>=2.0.0,<3.0.0"]
gguf = ["llama-cpp-python>=0.2.82,<0.2.83"]
annoy = ["annoy>=1.17.2,<2.0.0"]

[tool.ruff]
2 changes: 1 addition & 1 deletion skllm/__init__.py
@@ -1,2 +1,2 @@
__version__ = '1.3.1'
__version__ = '1.4.0'
__author__ = 'Iryna Kondrashchenko, Oleh Kostromin'
37 changes: 36 additions & 1 deletion skllm/config.py
@@ -7,6 +7,9 @@
_AZURE_API_VERSION_VAR = "SKLLM_CONFIG_AZURE_API_VERSION"
_GOOGLE_PROJECT = "GOOGLE_CLOUD_PROJECT"
_GPT_URL_VAR = "SKLLM_CONFIG_GPT_URL"
_GGUF_DOWNLOAD_PATH = "SKLLM_CONFIG_GGUF_DOWNLOAD_PATH"
_GGUF_MAX_GPU_LAYERS = "SKLLM_CONFIG_GGUF_MAX_GPU_LAYERS"
_GGUF_VERBOSE = "SKLLM_CONFIG_GGUF_VERBOSE"


class SKLLMConfig:
@@ -169,4 +172,36 @@ def get_gpt_url() -> Optional[str]:
@staticmethod
def reset_gpt_url():
"""Resets the GPT URL."""
os.environ.pop(_GPT_URL_VAR, None)
os.environ.pop(_GPT_URL_VAR, None)

@staticmethod
def get_gguf_download_path() -> str:
"""Gets the path to store the downloaded GGUF files."""
default_path = os.path.join(os.path.expanduser("~"), ".skllm", "gguf")
return os.environ.get(_GGUF_DOWNLOAD_PATH, default_path)

@staticmethod
def get_gguf_max_gpu_layers() -> int:
"""Gets the maximum number of layers to use for the GGUF model."""
return int(os.environ.get(_GGUF_MAX_GPU_LAYERS, 0))

@staticmethod
def set_gguf_max_gpu_layers(n_layers: int):
"""Sets the maximum number of layers to use for the GGUF model."""
if not isinstance(n_layers, int):
raise ValueError("n_layers must be an integer")
if n_layers < -1:
n_layers = -1
os.environ[_GGUF_MAX_GPU_LAYERS] = str(n_layers)

@staticmethod
def set_gguf_verbose(verbose: bool):
"""Sets the verbosity of the GGUF model."""
if not isinstance(verbose, bool):
raise ValueError("verbose must be a boolean")
os.environ[_GGUF_VERBOSE] = str(verbose)

@staticmethod
def get_gguf_verbose() -> bool:
"""Gets the verbosity of the GGUF model."""
return os.environ.get(_GGUF_VERBOSE, "False").lower() == "true"
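A minimal usage sketch of the configuration helpers added above (names and defaults taken from this hunk; none of this is needed for the OpenAI backends):

from skllm.config import SKLLMConfig

# -1 is clamped by set_gguf_max_gpu_layers and conventionally means "offload all
# layers" in llama.cpp; the default of 0 keeps everything on the CPU.
SKLLMConfig.set_gguf_max_gpu_layers(-1)

# Enable llama.cpp logging for local GGUF models.
SKLLMConfig.set_gguf_verbose(True)

# Defaults to ~/.skllm/gguf unless SKLLM_CONFIG_GGUF_DOWNLOAD_PATH is set.
print(SKLLMConfig.get_gguf_download_path())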
54 changes: 0 additions & 54 deletions skllm/llm/gpt/clients/gpt4all/completion.py

This file was deleted.

12 changes: 12 additions & 0 deletions skllm/llm/gpt/clients/llama_cpp/completion.py
@@ -0,0 +1,12 @@
from skllm.llm.gpt.clients.llama_cpp.handler import ModelCache, LlamaHandler


def get_chat_completion(messages: dict, model: str, **kwargs):

with ModelCache.lock:
handler = ModelCache.get(model)
if handler is None:
handler = LlamaHandler(model)
ModelCache.store(model, handler)

return handler.get_chat_completion(messages, **kwargs)
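A hypothetical call sequence for this client, assuming the OpenAI-style list-of-messages format used elsewhere in skllm; the second call should reuse the handler cached in ModelCache rather than reloading the weights:

from skllm.llm.gpt.clients.llama_cpp.completion import get_chat_completion
from skllm.llm.gpt.clients.llama_cpp.handler import ModelCache

messages = [{"role": "user", "content": "Say hello."}]
get_chat_completion(messages, "phi3-mini-q4")  # first call: may download and load the GGUF file
get_chat_completion(messages, "phi3-mini-q4")  # second call: reuses the cached LlamaHandler
assert ModelCache.get("phi3-mini-q4") is not None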
164 changes: 164 additions & 0 deletions skllm/llm/gpt/clients/llama_cpp/handler.py
@@ -0,0 +1,164 @@
import threading
import os
import hashlib
import requests
from tqdm import tqdm
import hashlib
from typing import Optional
import tempfile
from skllm.config import SKLLMConfig
from warnings import warn


try:
from llama_cpp import Llama as _Llama

_llama_imported = True
except (ImportError, ModuleNotFoundError):
_llama_imported = False


supported_models = {
"llama3-8b-q4": {
"download_url": "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf",
"sha256": "c57380038ea85d8bec586ec2af9c91abc2f2b332d41d6cf180581d7bdffb93c1",
"n_ctx": 8192,
"supports_system_message": True,
},
"gemma2-9b-q4": {
"download_url": "https://huggingface.co/bartowski/gemma-2-9b-it-GGUF/resolve/main/gemma-2-9b-it-Q4_K_M.gguf",
"sha256": "13b2a7b4115bbd0900162edcebe476da1ba1fc24e718e8b40d32f6e300f56dfe",
"n_ctx": 8192,
"supports_system_message": False,
},
"phi3-mini-q4": {
"download_url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
"sha256": "8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef",
"n_ctx": 4096,
"supports_system_message": False,
},
"mistral0.3-7b-q4": {
"download_url": "https://huggingface.co/lmstudio-community/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3-Q4_K_M.gguf",
"sha256": "1270d22c0fbb3d092fb725d4d96c457b7b687a5f5a715abe1e818da303e562b6",
"n_ctx": 32768,
"supports_system_message": False,
},
"gemma2-2b-q6": {
"download_url": "https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q6_K_L.gguf",
"sha256": "b2ef9f67b38c6e246e593cdb9739e34043d84549755a1057d402563a78ff2254",
"n_ctx": 8192,
"supports_system_message": False,
},
}


class LlamaHandler:

def maybe_download_model(self, model_name, download_url, sha256) -> str:
download_folder = SKLLMConfig.get_gguf_download_path()
os.makedirs(download_folder, exist_ok=True)
model_name = model_name + ".gguf"
model_path = os.path.join(download_folder, model_name)
if not os.path.exists(model_path):
print("The model `{0}` is not found locally.".format(model_name))
self._download_model(model_name, download_folder, download_url, sha256)
return model_path

def _download_model(
self, model_filename: str, model_path: str, url: str, expected_sha256: str
) -> str:
full_path = os.path.join(model_path, model_filename)
temp_file = tempfile.NamedTemporaryFile(delete=False, dir=model_path)
temp_path = temp_file.name
temp_file.close()

response = requests.get(url, stream=True)

if response.status_code != 200:
os.remove(temp_path)
raise ValueError(
f"Request failed: HTTP {response.status_code} {response.reason}"
)

total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024 * 1024 * 4

sha256 = hashlib.sha256()

with (
open(temp_path, "wb") as file,
tqdm(
desc="Downloading {0}: ".format(model_filename),
total=total_size_in_bytes,
unit="iB",
unit_scale=True,
) as progress_bar,
):
for data in response.iter_content(block_size):
file.write(data)
sha256.update(data)
progress_bar.update(len(data))

downloaded_sha256 = sha256.hexdigest()
if downloaded_sha256 != expected_sha256:
raise ValueError(
f"Expected SHA-256 hash {expected_sha256}, but got {downloaded_sha256}"
)

os.rename(temp_path, full_path)

def __init__(self, model: str):
if not _llama_imported:
raise ImportError(
"llama_cpp is not installed, try `pip install scikit-llm[llama_cpp]`"
)
self.lock = threading.Lock()
if model not in supported_models:
raise ValueError(f"Model {model} is not supported.")
download_url = supported_models[model]["download_url"]
sha256 = supported_models[model]["sha256"]
n_ctx = supported_models[model]["n_ctx"]
self.supports_system_message = supported_models[model][
"supports_system_message"
]
if not self.supports_system_message:
warn(
f"The model {model} does not support system messages. This may cause issues with some estimators."
)
extended_model_name = model + "-" + sha256[:8]
model_path = self.maybe_download_model(
extended_model_name, download_url, sha256
)
max_gpu_layers = SKLLMConfig.get_gguf_max_gpu_layers()
verbose = SKLLMConfig.get_gguf_verbose()
self.model = _Llama(
model_path=model_path,
n_ctx=n_ctx,
verbose=verbose,
n_gpu_layers=max_gpu_layers,
)

def get_chat_completion(self, messages: dict, **kwargs):
if not self.supports_system_message:
messages = [m for m in messages if m["role"] != "system"]
with self.lock:
return self.model.create_chat_completion(
messages, temperature=0.0, **kwargs
)


class ModelCache:
lock = threading.Lock()
cache: dict[str, LlamaHandler] = {}

@classmethod
def get(cls, key) -> Optional[LlamaHandler]:
return cls.cache.get(key, None)

@classmethod
def store(cls, key, value):
cls.cache[key] = value

@classmethod
def clear(cls):
cls.cache = {}
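A sketch of driving the handler directly, assuming llama-cpp-python's create_chat_completion returns its usual OpenAI-compatible response dict; the model name must be one of the supported_models keys above:

from skllm.llm.gpt.clients.llama_cpp.handler import LlamaHandler

# Downloads the GGUF file into SKLLMConfig.get_gguf_download_path() on first use
# and verifies it against the SHA-256 listed in supported_models.
handler = LlamaHandler("gemma2-2b-q6")
reply = handler.get_chat_completion([{"role": "user", "content": "Say hello."}])
print(reply["choices"][0]["message"]["content"])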
17 changes: 11 additions & 6 deletions skllm/llm/gpt/completion.py
@@ -2,12 +2,13 @@
from skllm.llm.gpt.clients.openai.completion import (
get_chat_completion as _oai_get_chat_completion,
)
from skllm.llm.gpt.clients.gpt4all.completion import (
get_chat_completion as _g4a_get_chat_completion,
from skllm.llm.gpt.clients.llama_cpp.completion import (
get_chat_completion as _llamacpp_get_chat_completion,
)
from skllm.llm.gpt.utils import split_to_api_and_model
from skllm.config import SKLLMConfig as _Config


def get_chat_completion(
messages: dict,
openai_key: str = None,
@@ -17,14 +18,18 @@ def get_chat_completion(
):
"""Gets a chat completion from the OpenAI compatible API."""
api, model = split_to_api_and_model(model)
if api == "gpt4all":
return _g4a_get_chat_completion(messages, model)
if api == "gguf":
return _llamacpp_get_chat_completion(messages, model)
else:
url = _Config.get_gpt_url()
if api == "openai" and url is not None:
warnings.warn(f"You are using the OpenAI backend with a custom URL: {url}; did you mean to use the `custom_url` backend?\nTo use the OpenAI backend, please remove the custom URL using `SKLLMConfig.reset_gpt_url()`.")
warnings.warn(
f"You are using the OpenAI backend with a custom URL: {url}; did you mean to use the `custom_url` backend?\nTo use the OpenAI backend, please remove the custom URL using `SKLLMConfig.reset_gpt_url()`."
)
elif api == "custom_url" and url is None:
raise ValueError("You are using the `custom_url` backend but no custom URL was provided. Please set it using `SKLLMConfig.set_gpt_url(<url>)`.")
raise ValueError(
"You are using the `custom_url` backend but no custom URL was provided. Please set it using `SKLLMConfig.set_gpt_url(<url>)`."
)
return _oai_get_chat_completion(
messages,
openai_key,
6 changes: 6 additions & 0 deletions skllm/llm/gpt/mixin.py
@@ -81,6 +81,12 @@ def _get_openai_key(self) -> str:
key = self.key
if key is None:
key = _Config.get_openai_key()
if (
hasattr(self, "model")
and isinstance(self.model, str)
and self.model.startswith("gguf::")
):
key = "gguf"
if key is None:
raise RuntimeError("OpenAI key was not found")
return key
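The branch added above supplies a placeholder key for gguf:: models, so estimator code that calls _get_openai_key no longer fails when no OpenAI key is configured. At the dispatcher level the same prefix routes straight to the llama-cpp client, e.g. (a sketch, assuming the parameters hidden by the truncated hunk keep their defaults):

from skllm.llm.gpt.completion import get_chat_completion

# No OpenAI key is configured: the "gguf" api prefix bypasses the OpenAI client entirely.
out = get_chat_completion(
    [{"role": "user", "content": "Say hello."}],
    model="gguf::phi3-mini-q4",  # dispatched to _llamacpp_get_chat_completion
)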
4 changes: 2 additions & 2 deletions skllm/llm/gpt/utils.py
@@ -1,6 +1,6 @@
from typing import Tuple

SUPPORTED_APIS = ["openai", "azure", "gpt4all", "custom_url"]
SUPPORTED_APIS = ["openai", "azure", "gguf", "custom_url"]


def split_to_api_and_model(model: str) -> Tuple[str, str]:
@@ -9,4 +9,4 @@ def split_to_api_and_model(model: str) -> Tuple[str, str]:
for api in SUPPORTED_APIS:
if model.startswith(f"{api}::"):
return api, model[len(api) + 2 :]
raise ValueError(f"Unsupported API: {model.split('::')[0]}")
raise ValueError(f"Unsupported API: {model.split('::')[0]}")
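The routing rule can be illustrated with the prefixes registered in SUPPORTED_APIS:

from skllm.llm.gpt.utils import split_to_api_and_model

assert split_to_api_and_model("gguf::llama3-8b-q4") == ("gguf", "llama3-8b-q4")
assert split_to_api_and_model("openai::gpt-4o") == ("openai", "gpt-4o")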
12 changes: 8 additions & 4 deletions skllm/models/_base/classifier.py
@@ -239,10 +239,14 @@ def predict(self, X: Union[np.ndarray, pd.Series, List[str]], num_workers: int =
warnings.warn(
"Passing num_workers to predict is temporary and will be removed in the future."
)
with ThreadPoolExecutor(max_workers=num_workers) as executor:
predictions = list(
tqdm(executor.map(self._predict_single, X), total=len(X))
)
with ThreadPoolExecutor(max_workers=num_workers) as executor:
predictions = list(
tqdm(executor.map(self._predict_single, X), total=len(X))
)
else:
predictions = []
for x in tqdm(X):
predictions.append(self._predict_single(x))

return np.array(predictions)

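A rough sketch of how the updated predict might be exercised; ZeroShotGPTClassifier and its import path are assumed from the existing scikit-llm API rather than this diff, and the condition guarding the threaded branch is hidden above the hunk:

from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier

clf = ZeroShotGPTClassifier(model="gguf::gemma2-2b-q6")  # no OpenAI key needed (see the mixin change)
clf.fit(None, ["positive", "negative"])  # zero-shot: only the candidate labels are required
labels = clf.predict(["The movie was great."])  # default num_workers: presumably the new sequential tqdm loop
labels = clf.predict(["The movie was great."], num_workers=2)  # presumably keeps the ThreadPoolExecutor path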
