{ pkgs, ... }:
let
  vars = import ./vars.nix;
in
{
  services = {
    ollama = {
      enable = true;
      package = pkgs.ollama;
      syncModels = true;
      # Models pulled automatically by the ollama-model-loader service.
      loadModels = [
        "deepseek-r1:1.5b"
        "deepseek-r1:32b"
        "deepseek-r1:70b"
        #"qwen3"
        #"qwen3.5:latest"
        "qwen3-coder-next"
        "lennyerik/zeta"
        "nomic-embed-text:latest"
        "lfm2:24b"
        "glm-4.7-flash"
        "nemotron-cascade-2:30b"
        "magistral"
        "devstral-small-2"
      ];
      models = vars.primary_ollama;
      environmentVariables = {
        # FIX: Ollama reads OLLAMA_FLASH_ATTENTION, not FLASH_ATTENTION.
        # The old name was silently ignored, which also made the quantized
        # KV cache setting below inert (it requires flash attention).
        OLLAMA_FLASH_ATTENTION = "1";
        # Quantize the KV cache to q8_0 to roughly halve its VRAM footprint.
        OLLAMA_KV_CACHE_TYPE = "q8_0";

        # Ollama memory configuration
        OLLAMA_MAX_LOADED_MODELS = "3";
        OLLAMA_MAX_QUEUE = "512";
        OLLAMA_NUM_PARALLEL = "1";

        # ROCm memory optimization
        #HIP_VISIBLE_DEVICES = "0";
        #ROCR_VISIBLE_DEVICES = "0";

        # context length for agents
        OLLAMA_CONTEXT_LENGTH = "128000";
      };
      openFirewall = true;
      host = "0.0.0.0"; # don't want to make this available via load-balancer yet, so making it available on the local network
    };

    open-webui = {
      enable = true;
      port = 21212;
      openFirewall = true;
      host = "0.0.0.0"; # don't want to make this available via load-balancer yet, so making it available on the local network
    };
  };

  # Give the ollama service user access to the GPU device nodes (ROCm needs
  # membership in "render" and "video").
  users.users.ollama = {
    extraGroups = [
      "render"
      "video"
    ];
    group = "ollama";
    isSystemUser = true;
  };
  users.groups.ollama = { };

  # De-prioritize ollama CPU/IO so inference never starves interactive workloads.
  systemd.services = {
    ollama.serviceConfig = {
      Nice = 19;
      # NOTE(review): no IOSchedulingClass is set here (the model loader below
      # uses "idle"); systemd defaults the class to best-effort when only a
      # priority is given — confirm this asymmetry is intentional.
      IOSchedulingPriority = 7;
    };
    ollama-model-loader.serviceConfig = {
      Nice = 19;
      CPUWeight = 50;
      IOSchedulingClass = "idle";
      IOSchedulingPriority = 7;
    };
  };
}