Files
packetserver/packetserver/runner/podman.py
2025-03-18 22:50:24 -04:00

463 lines
19 KiB
Python

"""Uses podman to run jobs in containers."""
import time
from ZEO import client
from . import Runner, Orchestrator, RunnerStatus, RunnerFile, scripts_tar
from packetserver.runner.constants import podman_run_command
from urllib.parse import urlparse
from collections import namedtuple
from typing import Optional, Iterable, Union
from traceback import format_exc
import podman
import gzip
from podman.domain.containers import Container
import podman.errors
import os
import os.path
import logging
import datetime
from os.path import basename, dirname
from packetserver.common.util import bytes_to_tar_bytes, random_string, extract_tar_bytes, bytes_tar_has_files, \
TarFileExtractor
from packetserver import VERSION as packetserver_version
import re
from threading import Thread
from io import BytesIO
# Pattern used to split one "NAME=value" entry of a container's environment list.
# Group 1 is the variable name (everything before the first '='), group 2 is the
# value (everything after it, possibly empty). The previous pattern accepted only
# ASCII letters and digits on both sides, so names containing '_' (for example
# PACKETSERVER_USER) never matched and values such as paths were silently
# truncated at the first non-alphanumeric character. (?s) lets the value span
# embedded newlines.
env_splitter_rex = r'''(?s)([^=]+)=(.*)'''

# Tunables for the podman orchestrator:
#   default_timeout     - timeout value for jobs, in seconds (not consumed in this module's visible code)
#   max_timeout         - upper bound for job timeouts, in seconds (not consumed in this module's visible code)
#   image_name          - image used when creating a user container
#   max_active_jobs     - maximum number of unfinished runners allowed at once
#   container_keepalive - seconds without activity before a user container is removed
#   name_prefix         - prefix prepended to the username to form the container name
PodmanOptions = namedtuple("PodmanOptions", ["default_timeout", "max_timeout", "image_name",
                                             "max_active_jobs", "container_keepalive", "name_prefix"])
class PodmanRunner(Runner):
    """Runs one job inside an already-running podman container.

    The job command is executed with ``Container.exec_run`` on a background
    thread. After the command returns, the job end script is run as root and
    the artifact archive written to ``archive_path`` is pulled out of the
    container and kept in memory.
    """

    def __init__(self, username: str, args: Union[str, list[str]], job_id: int, container: Container,
                 environment: Optional[dict] = None, timeout_secs: int = 300, labels: Optional[list] = None,
                 files: Optional[list[RunnerFile]] = None):
        """Bind the runner to ``container``.

        Raises:
            ValueError: if the container's inspect data does not report it as running.
        """
        super().__init__(username, args, job_id, environment=environment, timeout_secs=timeout_secs,
                         labels=labels, files=files)
        self._artifact_archive = b''
        if not container.inspect()['State']['Running']:
            raise ValueError(f"Container {container} is not in state Running.")
        self.container = container
        self._thread = None
        self.env['PACKETSERVER_JOBID'] = str(job_id)
        # Working directory of the job inside the container.
        self.job_path = os.path.join("/home", self.username, ".packetserver", str(job_id))
        # Location the artifact archive is fetched from once the end script has run.
        self.archive_path = os.path.join("/artifact_output", f"{str(job_id)}.tar.gz")

    def _exec_job(self):
        """Execute the job command as the job's user and return the raw ``exec_run`` result."""
        if isinstance(self.args, str):
            logging.debug(f"Running string: {self.args}")
            command = self.args
        else:
            logging.debug(f"Running iterable: {list(self.args)}")
            command = list(self.args)
        return self.container.exec_run(cmd=command, environment=self.env, user=self.username, demux=True,
                                       workdir=self.job_path)

    def _run_end_script(self):
        """Run the job end script as root and log an error if it exits non-zero."""
        logging.debug(f"Running cleanup script for {self.job_id}")
        end_res = self.container.exec_run("bash /root/scripts/job_end_script.sh",
                                          environment=self.env, user="root", tty=True)
        logging.debug(f"End result: {end_res}")
        if end_res[0] != 0:
            logging.error(f"End Job script failed:\n{end_res[1].decode()}")

    def _collect_artifacts(self):
        """Fetch the artifact archive from the container; best effort, never raises."""
        try:
            retrieved_tar_bytes = b''.join(self.container.get_archive(self.archive_path)[0])
            art_tar_bytes = extract_tar_bytes(retrieved_tar_bytes)[1]
            # Log the size only; dumping the whole archive into the log is wasteful.
            logging.debug(f"bytes retrieved: {len(retrieved_tar_bytes)}")
            if bytes_tar_has_files(gzip.GzipFile(fileobj=BytesIO(art_tar_bytes))):
                logging.debug("found artifacts; attaching to runner object")
                self._artifact_archive = art_tar_bytes
            else:
                logging.debug(f"no artifacts returned for job {self.job_id}")
        except Exception:
            logging.warning(f"Error retrieving artifacts for {self.job_id}:\n{format_exc()}")
            self._artifact_archive = b''

    def thread_runner(self):
        """Thread body: run the job, run the end script, collect artifacts, set the final status.

        Failures while talking to the container are recorded as a failed job
        (return code -1, traceback as the error stream) rather than being allowed
        to kill the thread, which would leave the runner unfinished forever.
        """
        self.status = RunnerStatus.RUNNING
        logging.debug(f"Thread for runner {self.job_id} started. Command for {(type(self.args))}:\n{self.args}")
        try:
            res = self._exec_job()
        except Exception:
            failure = format_exc()
            logging.error(f"Job {self.job_id} could not be executed:\n{failure}")
            res = (-1, (None, failure.encode()))
        logging.debug(str(res))
        # cleanup housekeeping
        self.status = RunnerStatus.STOPPING
        self._result = res
        try:
            self._run_end_script()
        except Exception:
            logging.error(f"Error running end job script for {self.job_id}:\n{format_exc()}")
        self._collect_artifacts()
        self.finished_at = datetime.datetime.now()
        # set final status to FAILED or SUCCEEDED
        if self.return_code == 0:
            self.status = RunnerStatus.SUCCESSFUL
        else:
            self.status = RunnerStatus.FAILED

    @property
    def has_artifacts(self) -> bool:
        """True when a non-empty artifact archive was collected."""
        return self._artifact_archive != b''

    @property
    def artifacts(self) -> TarFileExtractor:
        """Extractor over the collected artifact archive (over empty data when there is none)."""
        if self._artifact_archive == b'':
            return TarFileExtractor(BytesIO(b''))
        return TarFileExtractor(gzip.GzipFile(fileobj=BytesIO(self._artifact_archive)))

    @property
    def output(self) -> bytes:
        """Captured stdout of the job command.

        NOTE(review): with ``demux=True`` an empty stream appears to be reported
        as ``None``; that case is normalised to ``b''`` here - confirm against
        the podman-py version in use.
        """
        return self._result[1][0] or b''

    @property
    def output_str(self) -> str:
        """``output`` decoded to text, falling back to ``str()`` of the bytes if decoding fails."""
        try:
            return self.output.decode()
        except Exception:
            return str(self.output)

    @property
    def errors(self) -> bytes:
        """Captured stderr of the job command (``b''`` when the stream is empty)."""
        return self._result[1][1] or b''

    @property
    def errors_str(self) -> str:
        """``errors`` decoded to text, falling back to ``str()`` of the bytes if decoding fails."""
        try:
            return self.errors.decode()
        except Exception:
            return str(self.errors)

    @property
    def return_code(self) -> int:
        """Exit code reported for the job command."""
        return self._result[0]

    def start(self):
        """Run the job setup script, place the job's files, then launch the worker thread.

        Raises:
            RuntimeError: if the setup script exits non-zero (status is set to FAILED first).
        """
        logging.debug(f"Starting runner {self.job_id} for {self.username} with command:\n({type(self.args)}){self.args}")
        self.status = RunnerStatus.STARTING
        # Run job setup script
        logging.debug(f"Running job setup script for {self.job_id} runner")
        setup_res = self.container.exec_run("bash /root/scripts/job_setup_script.sh",
                                            environment=self.env, user="root", tty=True)
        logging.debug(f"Job {self.job_id} setup script:\n{str(setup_res[1])}")
        if setup_res[0] != 0:
            self.status = RunnerStatus.FAILED
            raise RuntimeError(f"Couldn't run setup scripts for {self.job_id}:\n{setup_res[1]}")
        # put files where they need to be
        for f in self.files:
            logging.debug(f"Adding file {f} for job {self.job_id}")
            if not f.isabs:
                # Relative destinations are resolved against the job directory.
                dest = os.path.join(self.job_path, f.destination_path)
                dirn = os.path.dirname(dest)
            else:
                dest = f.destination_path
                dirn = f.dirname
            if self.container.put_archive(dirn, f.tar_data()):
                logging.debug(f"Placed file {dest} for job {self.job_id}")
            else:
                logging.warning(f"Failed to place file {dest} for job {self.job_id}!!")
            if not f.root_owned:
                # Argument list instead of a formatted string so that a destination
                # containing whitespace is passed to chown as a single argument.
                self.container.exec_run(["chown", "-R", self.username, dest])
        # start thread
        logging.debug(f"Starting runner thread for {self.job_id}")
        self._thread = Thread(target=self.thread_runner)
        super().start()
        self._thread.start()
class PodmanOrchestrator(Orchestrator):
    """Creates per-user podman containers and schedules PodmanRunner jobs in them.

    Containers are named ``<name_prefix><username>``. ``user_containers`` maps a
    container name to the time it was last touched; a manager thread removes
    containers that have not been touched for ``container_keepalive`` seconds and
    periodically removes prefixed containers that are not tracked at all.
    """

    def __init__(self, uri: Optional[str] = None, options: Optional[PodmanOptions] = None):
        """Resolve the podman socket URI, verify it answers, and store the options.

        Raises:
            FileNotFoundError: if a ``unix://`` URI points at a path that does not exist.
        """
        super().__init__()
        self.started = False
        self.user_containers = {}
        self.manager_thread = None
        self._client = None
        # Counts manage_lifecycle() passes; orphan cleanup runs when it reaches 600.
        self._five_min_ticker = 600
        if uri:
            self.uri = uri
        else:
            self.uri = f"unix:///run/user/{os.getuid()}/podman/podman.sock"
        uri_parsed = urlparse(self.uri)
        if uri_parsed.scheme == "unix":
            if not os.path.exists(uri_parsed.path):
                raise FileNotFoundError(f"Podman socket not found: {self.uri}")
        test_client = self.new_client()
        try:
            logging.debug(f"Testing podman socket. Version: {test_client.info()}")
        finally:
            # The probe client is only needed for this check; release it.
            test_client.close()
            self._client = None
        self.username_containers = {}
        if options:
            self.opts = options
        else:
            self.opts = PodmanOptions(default_timeout=300, max_timeout=3600, image_name="debian", max_active_jobs=5,
                                      container_keepalive=300, name_prefix="packetserver_")

    @property
    def client(self) -> Optional[podman.PodmanClient]:
        """The current podman client, or None when no client is open."""
        return self._client

    def new_client(self) -> podman.PodmanClient:
        """Open a client for ``self.uri``, remember it as the current client, and return it."""
        cli = podman.PodmanClient(base_url=self.uri)
        self._client = cli
        return cli

    def add_file_to_user_container(self, username: str, data: bytes, path: str, root_owned=False):
        """Write ``data`` to ``path`` inside the user's container, creating the directory first.

        Unless ``root_owned`` is true, ownership of the file is handed to the
        user afterwards, mirroring how PodmanRunner.start() treats job files.

        Raises:
            RuntimeError: if the directory cannot be created or the upload fails.
        """
        cli = self.client
        file_dir = dirname(path)
        tar_data_bytes = bytes_to_tar_bytes(basename(path), data)
        con = cli.containers.get(self.get_container_name(username))
        res = con.exec_run(cmd=["mkdir", "-p", file_dir], user="root")
        # Exit code 0 means success; the old check (!= 1) raised on success.
        if res[0] != 0:
            raise RuntimeError("Couldn't create directory")
        if not con.put_archive(file_dir, tar_data_bytes):
            raise RuntimeError(f"Couldn't place file {path} in container for {username}")
        if not root_owned:
            chown_res = con.exec_run(cmd=["chown", "-R", username.lower().strip(), path], user="root")
            if chown_res[0] != 0:
                logging.warning(f"Couldn't change owner of {path} to {username}")

    def get_file_from_user_container(self, username: str, path: str) -> bytes:
        """Return the contents of ``path`` from the user's container."""
        cli = self.client
        con = cli.containers.get(self.get_container_name(username))
        tar_result = con.get_archive(path)
        bytes_tar = b"".join(tar_result[0])
        return extract_tar_bytes(bytes_tar)[1]

    def podman_container_env(self, container_name: str) -> dict:
        """Return the container's configured environment as a dict ({} if it does not exist).

        Entries that do not match ``env_splitter_rex`` are skipped.
        """
        cli = self.client
        try:
            con = cli.containers.get(container_name)
        except podman.errors.exceptions.NotFound:
            return {}
        splitter = re.compile(env_splitter_rex)
        env = {}
        for entry in con.inspect()['Config']['Env']:
            m = splitter.match(entry)
            if m:
                env[m.group(1)] = m.group(2)
        return env

    def podman_user_container_env(self, username: str) -> dict:
        """Environment of the container belonging to ``username``."""
        return self.podman_container_env(self.get_container_name(username))

    def podman_start_user_container(self, username: str) -> Container:
        """Create, start, and provision a container for ``username``.

        Provisioning uploads the job scripts to /root/scripts and runs the
        container setup script as root.

        Raises:
            RuntimeError: if the container does not reach the running state, the
                scripts cannot be uploaded, or the setup script exits non-zero.
        """
        container_env = {
            "PACKETSERVER_VERSION": packetserver_version,
            "PACKETSERVER_USER": username.strip().lower()
        }
        logging.debug(f"Starting user container for {username} with command {podman_run_command}")
        con = self.client.containers.create(self.opts.image_name, name=self.get_container_name(username),
                                            command=podman_run_command,
                                            environment=container_env, user="root")
        con.start()
        logging.debug(f"Container started for {username} from image {self.opts.image_name}")
        started_at = datetime.datetime.now()
        logging.debug(f"Container state: \n{con.inspect()['State']}")
        while con.inspect()['State']['Status'] not in ['exited', 'running']:
            logging.debug("Container state not in ['exited', 'running']")
            now = datetime.datetime.now()
            if (now - started_at).total_seconds() > 300:
                con.stop()
                con.remove()
                # Without this the loop would keep inspecting a removed container.
                raise RuntimeError(f"Couldn't start container for user {username}")
            time.sleep(.1)
        time.sleep(.5)
        if con.inspect()['State']['Status'] != 'running':
            logging.debug(f"Container for {username} isn't running. Cleaning it up.")
            # Best-effort cleanup: failures are logged, the RuntimeError below is what matters.
            try:
                con.stop()
            except Exception:
                logging.debug(f"Couldn't stop container for {username}:\n{format_exc()}")
            try:
                con.rename(f"{self.get_container_name(username)}_old")
                con.remove()
            except Exception:
                logging.debug(f"Couldn't remove container for {username}:\n{format_exc()}")
            raise RuntimeError(f"Couldn't start container for user {username}")
        if not con.put_archive('/root/scripts', scripts_tar()):
            con.stop()
            con.remove()
            raise RuntimeError("Failed to upload job scripts to container.")
        res = con.exec_run(cmd=["bash", "/root/scripts/container_setup_script.sh"], tty=True, user="root")
        logging.debug(f"Container setup script run:\n{res[1].decode()}\nExit Code: {res[0]}")
        if res[0] != 0:
            logging.error(f"Container setup script failed:\n{res[1].decode()}\nExit Code: {res[0]}")
            con.stop()
            con.remove()
            raise RuntimeError(f"Container setup script failed:\n{res[1].decode()}\nExit Code: {res[0]}")
        self.touch_user_container(username)
        return con

    def podman_remove_container_name(self, container_name: str):
        """Stop and remove the named container; every step is best effort and logged on failure.

        A running container first gets /root/ENDNOW touched and one second of
        grace. The container is renamed before removal so that its original
        name is released even if removal fails.
        """
        cli = self.client
        logging.debug(f"Attempting to remove container named {container_name}")
        try:
            con = cli.containers.get(container_name)
            if con.inspect()['State']['Status'] == 'running':
                con.exec_run(cmd="touch /root/ENDNOW", user="root")
                time.sleep(1)
        except podman.errors.exceptions.NotFound:
            logging.warning(f"Didn't find container named {container_name}")
            return
        try:
            con.rename(f"{container_name}_{random_string()}")
        except Exception:
            logging.error(f"Couldn't rename container:\n{format_exc()}")
        try:
            needs_stop = con.inspect()['State']['Status'] != 'exited'
        except Exception:
            logging.error(f"Couldn't inspect container:\n{format_exc()}")
            needs_stop = True
        if needs_stop:
            try:
                con.stop(timeout=10)
            except Exception:
                logging.error(f"Couldn't stop container:\n{format_exc()}")
        try:
            con.remove()
        except Exception:
            logging.error(f"Couldn't remove container:\n{format_exc()}")

    def podman_stop_user_container(self, username: str):
        """Remove the user's container and stop tracking it."""
        container_name = self.get_container_name(username)
        self.podman_remove_container_name(container_name)
        self.user_containers.pop(container_name, None)

    def podman_user_container_exists(self, username: str) -> bool:
        """True if podman knows a container with the user's container name."""
        try:
            self.client.containers.get(self.get_container_name(username))
            return True
        except podman.errors.exceptions.NotFound:
            return False

    def podman_run_command_simple(self, username: str, command: Iterable[str], as_root: bool = True) -> int:
        """Runs command defined by arguments iterable in container. As root by default. Returns exit code."""
        con = self.client.containers.get(self.get_container_name(username))
        exec_user = 'root' if as_root else username.lower().strip()
        return con.exec_run(list(command), user=exec_user)[0]

    def clean_orphaned_containers(self):
        """Remove every container whose name starts with the prefix but is not tracked."""
        cli = self.client
        for con in cli.containers.list(all=True):
            name = str(con.name)
            # startswith, not substring: only names this orchestrator could have produced.
            if name.startswith(self.opts.name_prefix) and name not in self.user_containers:
                self.podman_remove_container_name(name)

    def get_container_name(self, username: str) -> str:
        """Container name for ``username``: the prefix plus the stripped, lower-cased username."""
        return self.opts.name_prefix + username.lower().strip()

    def get_username_from_container_name(self, container_name: str) -> str:
        """Inverse of get_container_name().

        Only the leading prefix is stripped, so a username that itself contains
        the prefix text survives intact.

        Raises:
            ValueError: if ``container_name`` does not start with the prefix.
        """
        prefix = self.opts.name_prefix
        if not container_name.startswith(prefix):
            raise ValueError(f"{container_name} is not a user container")
        return container_name[len(prefix):]

    def touch_user_container(self, username: str):
        """Record activity on the user's container (resets its keepalive clock)."""
        self.user_containers[self.get_container_name(username)] = datetime.datetime.now()

    def start_user_container(self, username: str) -> Container:
        """Return the user's container, creating and provisioning it if it does not exist."""
        if not self.podman_user_container_exists(username):
            return self.podman_start_user_container(username)
        return self.client.containers.get(self.get_container_name(username))

    def clean_containers(self):
        """Remove tracked containers not touched for more than ``container_keepalive`` seconds."""
        now = datetime.datetime.now()
        # Collect first; the dict must not change size while it is being iterated.
        expired = [name for name, touched in self.user_containers.items()
                   if (now - touched).total_seconds() > self.opts.container_keepalive]
        for name in expired:
            logging.debug(f"Container {name} no activity for {self.opts.container_keepalive} seconds. Clearing.")
            self.podman_remove_container_name(name)
            del self.user_containers[name]

    def user_runners_in_process(self, username: str) -> int:
        """Number of runners belonging to ``username`` that are in process."""
        un = username.strip().lower()
        count = 0
        for r in self.runners:
            # NOTE(review): is_in_process is read as an attribute here while
            # is_finished() is called as a method elsewhere; if is_in_process is a
            # plain method this test is always true - confirm against Runner.
            if r.is_in_process and r.username == un:
                count += 1
        return count

    def user_running(self, username: str) -> bool:
        """True if ``username`` has at least one runner in process."""
        return self.user_runners_in_process(username) > 0

    def runners_in_process(self) -> int:
        """Number of runners that have not finished."""
        return sum(1 for r in self.runners if not r.is_finished())

    def runners_available(self) -> bool:
        """True when started and fewer than ``max_active_jobs`` runners are unfinished."""
        if not self.started:
            return False
        return self.runners_in_process() < self.opts.max_active_jobs

    def new_runner(self, username: str, args: Union[str, list[str]], job_id: int, environment: Optional[dict] = None,
                   timeout_secs: int = 300, refresh_db: bool = True, labels: Optional[list] = None,
                   files: Optional[list[RunnerFile]] = None) -> Optional[PodmanRunner]:
        """Create and start a runner for the job in the user's container.

        Returns None (after logging a warning) when the orchestrator is not
        started or no runner slot is free. ``refresh_db`` is accepted but not
        used by this method.
        """
        if not self.started:
            logging.warning("Attempted to queue a runner when not started")
            return None
        with self.runner_lock:
            if not self.runners_available():
                logging.warning("Attempted to queue a runner when no runner slots available.")
                return None
            con = self.start_user_container(username)
            logging.debug(f"Started a container for {username} successfully.")
            self.touch_user_container(username)
            logging.debug(f"Queuing a runner on container {con}, with command '{args}' of type '{type(args)}'")
            runner = PodmanRunner(username, args, job_id, con, environment=environment, timeout_secs=timeout_secs,
                                  labels=labels, files=files)
            self.runners.append(runner)
            runner.start()
            return runner

    def manage_lifecycle(self):
        """One housekeeping pass: refresh active containers, expire idle ones, sweep orphans.

        The orphan sweep runs once every 600 passes, which is roughly five
        minutes at the manager thread's half-second cadence.
        """
        if not self.started:
            return
        with self.runner_lock:
            for r in self.runners:
                if not r.is_finished():
                    self.touch_user_container(r.username)
            self.clean_containers()
            # The counter was previously never advanced, so the sweep never ran again after start().
            self._five_min_ticker += 1
            if self._five_min_ticker >= 600:
                self.clean_orphaned_containers()
                self._five_min_ticker = 0

    def manager(self):
        """Manager thread body: run manage_lifecycle() every half second while started."""
        logging.debug("Starting podman orchestrator thread.")
        while self.started:
            try:
                self.manage_lifecycle()
            except Exception:
                # One failed pass must not end housekeeping for the whole process.
                logging.error(f"Error in podman orchestrator lifecycle:\n{format_exc()}")
            time.sleep(.5)
        logging.debug("Stopping podman orchestrator thread.")

    def start(self):
        """Open a client, remove leftover containers, and launch the manager thread (no-op if started)."""
        if not self.started:
            self.new_client()
            self.clean_orphaned_containers()
            self._five_min_ticker = 0
            self.started = True
            self.manager_thread = Thread(target=self.manager)
            self.manager_thread.start()

    def __del__(self):
        # getattr: __init__ may have failed before 'started' was assigned.
        if getattr(self, "started", False):
            self.stop()

    def stop(self):
        """Stop the manager thread, remove every prefixed container, and release the client."""
        logging.debug("Stopping podman orchestrator.")
        self.started = False
        # Let the manager thread finish first so it is not removing containers concurrently.
        if self.manager_thread is not None:
            logging.debug("Joining orchestrator manager thread.")
            self.manager_thread.join(timeout=15)
            logging.debug("Orchestrator manager thread stopped")
            self.manager_thread = None
        # With nothing tracked, every prefixed container counts as orphaned and is removed.
        self.user_containers = {}
        if self._client is not None:
            try:
                self.clean_orphaned_containers()
            finally:
                try:
                    self._client.close()
                except Exception:
                    logging.error(f"Couldn't close podman client:\n{format_exc()}")
                self._client = None