From 66b6ea4fc46cbd94f4759e2f886881ac3bccdbca Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 13:08:28 +0100 Subject: [PATCH 01/19] refs #1104 return inventory inline --- linux/debian/changelog | 6 +++ ogcore-mock.py | 2 +- src/VERSION | 2 +- .../modules/server/CloningEngine/__init__.py | 53 ++++++++----------- src/opengnsys/workers/oglive_worker.py | 8 +-- 5 files changed, 35 insertions(+), 36 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 509b0b7..736204c 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre1-1) stable; urgency=medium + + * CrearImagen: return inventory inline + + -- OpenGnsys developers Wed, 06 Nov 2024 12:41:14 +0100 + ogagent (1.4.4-1) stable; urgency=medium * Use logger.debug() to prevent the windows agent from dying diff --git a/ogcore-mock.py b/ogcore-mock.py index 716e52f..2c1e69d 100644 --- a/ogcore-mock.py +++ b/ogcore-mock.py @@ -178,7 +178,7 @@ def oac_recibe_archivo(): logging.info(f'dec ({dec})') return jsonify({'anything':'anything'}) ## if we return {}, then we trigger "if not {}" which happens to be true -@app.route('/opengnsys/rest/ogAdmClient/callback', methods=['POST']) +@app.route('/opengnsys/rest/clients/status/webhook', methods=['POST']) def oac_callback(): logging.info(f'{request.get_json()}') return jsonify({'anything':'anything'}) diff --git a/src/VERSION b/src/VERSION index 1c99cf0..e167f12 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.4 +1.4.5-pre1 diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index b95bd23..08ff75a 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -48,7 +48,7 @@ class CloningEngineWorker (ogLiveWorker): def onDeactivation (self): logger.debug ('onDeactivation') - def InventariandoSoftware (self, dsk, par, sw, nfn): + def InventariandoSoftware (self, dsk, par, nfn): sft_src = f'/tmp/CSft-{self.IPlocal}-{par}' try: self.interfaceAdmin (nfn, [dsk, par, sft_src]) @@ -58,31 +58,23 @@ class CloningEngineWorker (ogLiveWorker): if herror: logger.warning ('Error al ejecutar el comando') + b64 = '' self.muestraMensaje (20) else: if not os.path.exists (sft_src): raise Exception (f'interfaceAdmin({nfn}) returned success but did not create file ({sft_src})') sft_src_contents = Path (sft_src).read_bytes() - ## Envía fichero de inventario al servidor - sft_dst = f'/tmp/Ssft-{self.IPlocal}-{par}' ## Nombre que tendra el archivo en el Servidor - logger.debug ('sending recibeArchivo to server') - res = self.enviaMensajeServidor ('recibeArchivo', { 'nfl': sft_dst, 'contents': base64.b64encode (sft_src_contents).decode ('utf-8') }) - logger.debug (res) - if not res: - herror = 12 ## Error de envío de fichero por la red - raise Exception ('Ha ocurrido algún problema al enviar un archivo por la red') + b64 = base64.b64encode (sft_src_contents).decode ('utf-8') self.muestraMensaje (19) - if not sw: - cmd = { - 'nfn': 'RESPUESTA_InventarioSoftware', - 'par': par, - 'sft': sft_dst, - } - return self.respuestaEjecucionComando (cmd, herror, 0) - - return {'true':'true'} ## XXX + cmd = { + 'nfn': 'RESPUESTA_InventarioSoftware', + 'dsk': dsk, ## not in the original C code, around ogAdmClient.c:1944 + 'par': par, + 'contents': b64, + } + return self.respuestaEjecucionComando (cmd, herror, 0) def do_CrearImagen (self, post_params): for k in ['dsk', 'par', 'cpt', 'idi', 'nci', 'ipr', 'nfn', 'ids']: @@ -102,13 +94,14 @@ class CloningEngineWorker (ogLiveWorker): self.muestraMensaje (7) try: - res = self.InventariandoSoftware (dsk, par, False, 'InventarioSoftware') ## Crea inventario Software previamente + res = self.InventariandoSoftware (dsk, par, 'InventarioSoftware') ## Crea inventario Software previamente except: logger.warning ('Error al ejecutar el comando') return {} - if res: + if res['contents']: self.muestraMensaje (2) + inv_sft = res['contents'] try: self.interfaceAdmin (nfn, [dsk, par, nci, ipr]) self.muestraMensaje (9) @@ -119,16 +112,18 @@ class CloningEngineWorker (ogLiveWorker): herror = 1 else: logger.warning ('Error al ejecutar el comando') + inv_sft = '' self.muestraMenu() cmd = { - 'nfn': 'RESPUESTA_CrearImagen', - 'idi': idi, ## Identificador de la imagen - 'dsk': dsk, ## Número de disco - 'par': par, ## Número de partición de donde se creó - 'cpt': cpt, ## Tipo o código de partición - 'ipr': ipr, ## Ip del repositorio donde se alojó + 'nfn': 'RESPUESTA_CrearImagen', + 'idi': idi, ## Identificador de la imagen + 'dsk': dsk, ## Número de disco + 'par': par, ## Número de partición de donde se creó + 'cpt': cpt, ## Tipo o código de partición + 'ipr': ipr, ## Ip del repositorio donde se alojó + 'inv_sft': inv_sft, } return self.respuestaEjecucionComando (cmd, herror, ids) @@ -275,16 +270,14 @@ class CloningEngineWorker (ogLiveWorker): self.muestraMensaje (7) try: - self.InventariandoSoftware (dsk, par, True, 'InventarioSoftware') + cmd = self.InventariandoSoftware (dsk, par, 'InventarioSoftware') herror = 0 except: logger.warning ('Error al ejecutar el comando') + cmd = { 'nfn': 'RESPUESTA_InventarioSoftware' } herror = 1 self.muestraMenu() - cmd = { - 'nfn': 'RESPUESTA_InventarioSoftware', - } return self.respuestaEjecucionComando (cmd, herror, ids) def process_CrearImagen (self, path, get_params, post_params, server): diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 63d1466..43291fd 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -139,7 +139,7 @@ class ogLiveWorker(ServerWorker): def notifier (self, result): logger.debug (f'notifier() called, result ({result})') - res = self.REST.sendMessage ('/clients/status/webhook', result) + res = self.REST.sendMessage ('clients/status/webhook', result) def mon (self): while True: @@ -153,7 +153,7 @@ class ogLiveWorker(ServerWorker): if not elem['thread'].is_alive(): logger.debug (f'is no longer alive, k ({k}) thread ({elem["thread"]})') elem['running'] = False - elem['result'] = elem['thread'].result + elem['result'] = elem['thread'].result ## race condition: KeyError: 'thread' del elem['thread'] self.notifier (elem['result']) @@ -169,9 +169,9 @@ class ogLiveWorker(ServerWorker): ''' if parametros: - proc = ['bash', '-c', '{} bash -x {} {}'.format (devel_bash_prefix, exe, ' '.join (parametros))] + proc = ['bash', '-c', '{} {} {}'.format (devel_bash_prefix, exe, ' '.join (parametros))] else: - proc = ['bash', '-c', '{} bash -x {}'.format (devel_bash_prefix, exe)] + proc = ['bash', '-c', '{} {}'.format (devel_bash_prefix, exe)] logger.debug ('subprocess.run ("{}", capture_output=True)'.format (proc)) p = subprocess.run (proc, capture_output=True) ## DEBUG -- 2.40.1 From b8733fea49b9a836062c8a1adc628d4280fd06a8 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 13:23:11 +0100 Subject: [PATCH 02/19] refs #1107 run one monitoring thread, not two --- src/opengnsys/modules/server/CloningEngine/__init__.py | 2 +- src/opengnsys/modules/server/ogAdmClient/__init__.py | 2 +- src/opengnsys/workers/oglive_worker.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index 08ff75a..06b704f 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -42,7 +42,7 @@ class CloningEngineWorker (ogLiveWorker): REST = None # REST object def onActivation (self): - super().onActivation() + super().onActivation (run_monitoring_thread=False) logger.info ('onActivation ok') def onDeactivation (self): diff --git a/src/opengnsys/modules/server/ogAdmClient/__init__.py b/src/opengnsys/modules/server/ogAdmClient/__init__.py index e151f4c..286c9e5 100644 --- a/src/opengnsys/modules/server/ogAdmClient/__init__.py +++ b/src/opengnsys/modules/server/ogAdmClient/__init__.py @@ -279,7 +279,7 @@ class ogAdmClientWorker (ogLiveWorker): #} def onActivation (self): - super().onActivation() + super().onActivation (run_monitoring_thread=True) logger.info ('Inicio de sesion') logger.info ('Abriendo sesión en el servidor de Administración') if (not self.inclusionCliente()): diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 43291fd..cfc3318 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -288,7 +288,7 @@ class ogLiveWorker(ServerWorker): return obj - def onActivation (self): + def onActivation (self, run_monitoring_thread): if not os.path.exists ('/scripts/oginit'): ## no estamos en oglive, este modulo no debe cargarse ## esta lógica la saco de src/opengnsys/linux/operations.py, donde hay un if similar @@ -322,7 +322,8 @@ class ogLiveWorker(ServerWorker): if not self.tomaMAClocal(): raise Exception ('Se han generado errores. No se puede continuar la ejecución de este módulo') - threading.Thread (name='monitoring_thread', target=self.mon, daemon=True).start() + if run_monitoring_thread: ## should be true for exactly one ogLiveWorker + threading.Thread (name='monitoring_thread', target=self.mon, daemon=True).start() def _long_running_job (self, name, f, args): any_job_running = False -- 2.40.1 From 6163c0b435973bb7f3a8ce5d61062be6b5aa3008 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 13:23:32 +0100 Subject: [PATCH 03/19] refs #1105 include job_id in async responses --- src/opengnsys/workers/oglive_worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index cfc3318..599aba5 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -137,8 +137,9 @@ class ogLiveWorker(ServerWorker): "Error desconocido", ] - def notifier (self, result): - logger.debug (f'notifier() called, result ({result})') + def notifier (self, job_id, result): + logger.debug (f'notifier() called, job_id ({job_id}) result ({result})') + result['job_id'] = job_id res = self.REST.sendMessage ('clients/status/webhook', result) def mon (self): @@ -155,7 +156,7 @@ class ogLiveWorker(ServerWorker): elem['running'] = False elem['result'] = elem['thread'].result ## race condition: KeyError: 'thread' del elem['thread'] - self.notifier (elem['result']) + self.notifier (k, elem['result']) time.sleep (1) -- 2.40.1 From ef0920079b17f37111299ab9b4f35a6a7f6179c5 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 13:35:12 +0100 Subject: [PATCH 04/19] refs #1107 do not try to avoid races, as there are none now --- src/opengnsys/workers/oglive_worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 599aba5..91bec00 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -150,11 +150,10 @@ class ogLiveWorker(ServerWorker): if 'thread' not in elem: continue logger.debug (f'considering thread ({k})') try: elem['thread'].join (0.05) - except RuntimeError: pass ## race condition: a thread is created and this code runs before it is start()ed if not elem['thread'].is_alive(): logger.debug (f'is no longer alive, k ({k}) thread ({elem["thread"]})') elem['running'] = False - elem['result'] = elem['thread'].result ## race condition: KeyError: 'thread' + elem['result'] = elem['thread'].result del elem['thread'] self.notifier (k, elem['result']) -- 2.40.1 From dd82e4db5047218ffbb901c456110a50004dc93f Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 13:40:39 +0100 Subject: [PATCH 05/19] refs #1106 remove vim swapfile from the deb packages --- linux/debian/changelog | 8 ++++++++ linux/debian/rules | 1 + 2 files changed, 9 insertions(+) diff --git a/linux/debian/changelog b/linux/debian/changelog index 736204c..6d9dfe2 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,11 @@ +ogagent (1.4.5~pre2-1) stable; urgency=medium + + * Remove race condition due to several monitoring threads + * Include job_id in asynchronous responses + * Remove vim swapfiles from the package contents + + -- OpenGnsys developers Wed, 06 Nov 2024 13:24:03 +0100 + ogagent (1.4.5~pre1-1) stable; urgency=medium * CrearImagen: return inventory inline diff --git a/linux/debian/rules b/linux/debian/rules index ead6aa0..caa76e5 100755 --- a/linux/debian/rules +++ b/linux/debian/rules @@ -22,6 +22,7 @@ install: build dh_prep dh_installdirs $(MAKE) DESTDIR=$(CURDIR)/debian/ogagent install-ogagent + find $(CURDIR) -name '*.swp' -exec rm -f '{}' ';' binary-arch: build install # emptyness binary-indep: build install -- 2.40.1 From e2fcf022221fdbf5ec8822ed25c375796f7d9b27 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 6 Nov 2024 14:17:27 +0100 Subject: [PATCH 06/19] refs #1107 fix syntax --- src/opengnsys/workers/oglive_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 91bec00..ada7bc2 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -149,7 +149,7 @@ class ogLiveWorker(ServerWorker): elem = self.thread_list[k] if 'thread' not in elem: continue logger.debug (f'considering thread ({k})') - try: elem['thread'].join (0.05) + elem['thread'].join (0.05) if not elem['thread'].is_alive(): logger.debug (f'is no longer alive, k ({k}) thread ({elem["thread"]})') elem['running'] = False -- 2.40.1 From 87a5258de5b99323df65568f47dc400603e7862d Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Fri, 15 Nov 2024 11:41:12 +0100 Subject: [PATCH 07/19] refs #1108 add WIP for killing subprocesses --- linux/debian/changelog | 6 + src/VERSION | 2 +- .../modules/server/CloningEngine/__init__.py | 7 + .../modules/server/ogAdmClient/__init__.py | 7 + src/opengnsys/workers/oglive_worker.py | 127 +++++++++++++++--- 5 files changed, 131 insertions(+), 18 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 6d9dfe2..09edfe7 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre3-1) stable; urgency=medium + + * Kill long running jobs in oglive + + -- OpenGnsys developers Wed, 06 Nov 2024 14:11:32 +0100 + ogagent (1.4.5~pre2-1) stable; urgency=medium * Remove race condition due to several monitoring threads diff --git a/src/VERSION b/src/VERSION index e167f12..bf08677 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre1 +1.4.5-pre3 diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index 06b704f..8eed397 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -322,3 +322,10 @@ class CloningEngineWorker (ogLiveWorker): def process_InventarioSoftware (self, path, get_params, post_params, server): logger.debug ('in process_InventarioSoftware, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) return self._long_running_job ('InventarioSoftware', self.do_InventarioSoftware, args=(post_params,)) + + ## curl --insecure -X POST --data '{"job_id":"foo"}' https://192.168.2.199:8000/CloningEngine/KillJob + def process_KillJob (self, path, get_params, post_params, server): + logger.debug ('in process_KillJob, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) + jid = post_params['job_id'] + r = self.killer (jid) + return r diff --git a/src/opengnsys/modules/server/ogAdmClient/__init__.py b/src/opengnsys/modules/server/ogAdmClient/__init__.py index 286c9e5..f08fc16 100644 --- a/src/opengnsys/modules/server/ogAdmClient/__init__.py +++ b/src/opengnsys/modules/server/ogAdmClient/__init__.py @@ -561,3 +561,10 @@ class ogAdmClientWorker (ogLiveWorker): def process_EjecutaComandosPendientes (self, path, get_params, post_params, server): logger.debug ('in process_EjecutaComandosPendientes, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) return {'true':'true'} ## ogAdmClient.c:2138 + + ## curl --insecure -X POST --data '{"job_id":"foo"}' https://192.168.2.199:8000/ogAdmClient/KillJob + def process_KillJob (self, path, get_params, post_params, server): + logger.debug ('in process_KillJob, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) + jid = post_params['job_id'] + r = self.killer (jid) + return r diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index ada7bc2..0634a22 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -35,6 +35,7 @@ import time import random import subprocess import threading +import signal from configparser import NoOptionError from opengnsys import REST @@ -57,6 +58,7 @@ class ThreadWithResult (threading.Thread): class ogLiveWorker(ServerWorker): thread_list = {} + thread_lock = threading.Lock() tbErroresScripts = [ "Se han generado errores desconocidos. No se puede continuar la ejecución de este módulo", ## 0 @@ -142,20 +144,65 @@ class ogLiveWorker(ServerWorker): result['job_id'] = job_id res = self.REST.sendMessage ('clients/status/webhook', result) + def killer (self, job_id): + logger.debug (f'killer() called, job_id ({job_id})') + if job_id not in self.thread_list: return { 'res': 2, 'der': 'Unknown job' } + + with self.thread_lock: + if 'thread' not in self.thread_list[job_id]: return { 'res': 2, 'der': 'Job is not running' } + t = self.thread_list[job_id]['thread'] + pid = self.thread_list[job_id]['child_pid'] + logger.debug (f'pid ({pid})') + try_times = 8 + sig = signal.SIGTERM + while True: + t.join (0.05) + if not t.is_alive(): + logger.debug (f'thread exited, yay!') + ## limpieza + self.q = None + self.thread_list[job_id]['child_pid'] = None + break + if pid: + if os.path.exists (f'/proc/{pid}'): + logger.debug (f'would os.kill pid ({pid})') + ## si el proceso se muere justo aquí, no pasa nada: la señal va al vacío + #os.kill (pid, sig) + else: + logger.debug (f'pid ({pid}) is gone, nothing to kill...') + self.thread_list[job_id]['child_pid'] = None + else: + logger.debug (f'oops no tenemos a quien matar') + if not try_times: break + if 4 == try_times: sig = signal.SIGKILL ## change signal after a few tries + try_times -= 1 + time.sleep (0.4) + + ## y si no lo hemos conseguido, qué?? + return {'job_id':job_id} + def mon (self): while True: - #print ('mon(): iterating') - for k in self.thread_list: - elem = self.thread_list[k] - if 'thread' not in elem: continue - logger.debug (f'considering thread ({k})') - elem['thread'].join (0.05) - if not elem['thread'].is_alive(): - logger.debug (f'is no longer alive, k ({k}) thread ({elem["thread"]})') - elem['running'] = False - elem['result'] = elem['thread'].result - del elem['thread'] - self.notifier (k, elem['result']) + with self.thread_lock: + for k in self.thread_list: + elem = self.thread_list[k] + if 'thread' not in elem: continue + logger.debug (f'considering thread ({k})') + + if self.q: + if not q.empty(): + elem['child_pid'] = q.get() + logger.debug (f'queue not empty, got pid ({elem["child_pid"]})') + else: + logger.debug (f'queue empty') + + elem['thread'].join (0.05) + if not elem['thread'].is_alive(): + logger.debug (f'is no longer alive, k ({k}) thread ({elem["thread"]})') + elem['running'] = False + elem['result'] = elem['thread'].result + del elem['thread'] + self.notifier (k, elem['result']) time.sleep (1) @@ -173,22 +220,36 @@ class ogLiveWorker(ServerWorker): else: proc = ['bash', '-c', '{} {}'.format (devel_bash_prefix, exe)] logger.debug ('subprocess.run ("{}", capture_output=True)'.format (proc)) - p = subprocess.run (proc, capture_output=True) + + #p = subprocess.run (proc, capture_output=True) + p = subprocess.Popen (proc, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if self.q: + self.q.put (p.pid) + else: + logger.debug ('oops, queremos escribir el PID del hijo a la cola pero no hay cola') + sout = serr = '' + while p.poll() is None: + for l in iter (p.stdout.readline, b''): sout += l.decode ('utf-8', 'ignore') + for l in iter (p.stderr.readline, b''): serr += l.decode ('utf-8', 'ignore') + time.sleep (1) + sout = sout.strip() + serr = serr.strip() + ## DEBUG logger.info (f'stdout follows:') - for l in p.stdout.strip().decode ('utf-8').splitlines(): + for l in sout.splitlines(): logger.info (f' {l}') logger.info (f'stderr follows:') - for l in p.stderr.strip().decode ('utf-8').splitlines(): + for l in serr.splitlines(): logger.info (f' {l}') ## /DEBUG if 0 != p.returncode: cmd_txt = ' '.join (proc) logger.error (f'command ({cmd_txt}) failed, stderr follows:') - for l in p.stderr.strip().decode ('utf-8').splitlines(): + for l in serr.splitlines(): logger.error (f' {l}') raise Exception (f'command ({cmd_txt}) failed, see log for details') - return p.stdout.strip().decode ('utf-8') + return sout def tomaIPlocal (self): try: @@ -303,6 +364,7 @@ class ogLiveWorker(ServerWorker): self.idproautoexec = None self.idcentro = None ## Identificador del centro self.idaula = None ## Identificador del aula + self.q = None ## for passing PIDs around try: url = self.service.config.get (self.name, 'remote') @@ -336,11 +398,42 @@ class ogLiveWorker(ServerWorker): return { 'job_id': None, 'message': 'some job is already running, refusing to launch another one' } job_id = '{}-{}'.format (name, ''.join (random.choice ('0123456789abcdef') for _ in range (8))) + import queue + self.q = queue.Queue() ## a single queue works for us because we never have more than one long_running_job at the same time self.thread_list[job_id] = { 'thread': ThreadWithResult (target=f, args=args), 'starttime': time.time(), + 'child_pid': None, 'running': True, 'result': None } self.thread_list[job_id]['thread'].start() return { 'job_id': job_id } + +## para matar threads tengo lo siguiente: +## - aqui en _long_running_job meto una cola en self.q +## - (self.q fue inicializado a None al instanciar el objeto, para evitar error "objeto no tiene 'q'") +## - en el thread_list también tengo un child_pid para almacenar el pid de los procesos hijos +## - en interfaceAdm() al hacer subprocess.Popen(), recojo el pid y lo escribo en la queue +## - en mon() recojo pids de la queue y los meto en thread_list 'child_pid' +## - algunas funciones llaman a interfaceAdm más de una vez, y escriben más de un pid en la cola, y en mon() voy recogiendo y actualizando +## - por ejemplo EjecutarScript llama a interfaceAdm() y luego llama a LeeConfiguracion() el cual llama a interfaceAdm() otra vez +## - y cuando nos llamen a KillJob, terminamos en killer() el cual coge el 'child_pid' y zas +## - pero a lo mejor el child ya terminó +## - o a lo mejor el KillJob nos llegó demasiado pronto y todavía no hubo ningún child +## +## está sin probar. Simplemente probé que el agente arranca (o sea, que no lo rompí con estos cambios) +## versión 1.4.5-pre3, desplegada en entornos de desarrollo y funciona bien +## la idea sería mandarle un EjecutarScript 'sleep 30' y luego un KillJob +## +## $ curl --insecure -X POST --data '{"nfn":"EjecutarScript","scp":"cd /usr; sleep 30; pwd; ls","ids":"0"}' https://192.168.2.199:8000/ogAdmClient/EjecutarScript +## {"job_id": "EjecutarScript-333feb3f"} +## $ curl --insecure -X POST --data '{"job_id":"EjecutarScript-333feb3f"}' https://192.168.2.199:8000/CloningEngine/KillJob +## +## el KillJob de primeras no va a hacer nada (la llamada a os.kill() está comentada) +## entonces sería probar primero que el flujo va como espero: +## - que primero salga "would kill pid" en el log +## - y pasados unos segundos, al llamar a KillJob otra vez, salga "pid is gone" +## o en otra prueba, metiéndole un time.sleep() a piñón en interfaceAdm() antes de lanzar un hijo +## - que salga "oops no tenemos a quien matar" en el log +## y ya con esto comprobado, descomentar el os.kill() y hacer pruebas reales -- 2.40.1 From 029d3a778d6108b3c6fb7fc83b2aaef4b184bacc Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Fri, 15 Nov 2024 11:59:40 +0100 Subject: [PATCH 08/19] refs #1152 handle HTTP error responses without dying --- src/opengnsys/RESTApi.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/opengnsys/RESTApi.py b/src/opengnsys/RESTApi.py index 3e3468f..3abcfc9 100644 --- a/src/opengnsys/RESTApi.py +++ b/src/opengnsys/RESTApi.py @@ -168,4 +168,9 @@ class REST(object): url = self._getUrl(msg) logger.debug('Requesting {}'.format(url)) - return self._request(url, data) + try: + res = self._request(url, data) + return res + except: + logger.exception() + return None -- 2.40.1 From 9ac107dde4e28d4b93f501cc3379faed78b3439d Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Fri, 15 Nov 2024 12:23:42 +0100 Subject: [PATCH 09/19] refs #1112 use envvars for configuration --- linux/debian/changelog | 7 +++++++ src/VERSION | 2 +- src/cfg/ogagent.cfg | 12 ++++++------ src/opengnsys/modules/server/OpenGnSys/__init__.py | 13 ++++++++++++- src/opengnsys/workers/oglive_worker.py | 13 ++++++++++++- 5 files changed, 38 insertions(+), 9 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 09edfe7..7a44674 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,10 @@ +ogagent (1.4.5~pre4-1) stable; urgency=medium + + * Don't die when ogcore returns HTTP 4xx or 5xx + * Get ogcore IP and port from the environment + + -- OpenGnsys developers Fri, 15 Nov 2024 11:43:01 +0100 + ogagent (1.4.5~pre3-1) stable; urgency=medium * Kill long running jobs in oglive diff --git a/src/VERSION b/src/VERSION index bf08677..e2a171d 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre3 +1.4.5-pre4 diff --git a/src/cfg/ogagent.cfg b/src/cfg/ogagent.cfg index f9b012b..2d53f6c 100644 --- a/src/cfg/ogagent.cfg +++ b/src/cfg/ogagent.cfg @@ -7,9 +7,9 @@ port=8000 #path=test_modules/server,more_modules/server # Remote OpenGnsys Service -remote=https://192.168.2.1/opengnsys/rest +remote={}://{}/opengnsys/rest # Alternate OpenGnsys Service (comment out to enable this option) -#altremote=https://10.0.2.2/opengnsys/rest +#altremote={}://{}/opengnsys/rest # Execution level (permitted operations): status, halt, full level=full @@ -23,15 +23,15 @@ log=DEBUG [ogAdmClient] #path=test_modules/server,more_modules/server -remote=https://192.168.2.1/opengnsys/rest +remote={}://{}/opengnsys/rest log=DEBUG pathinterface=/opt/opengnsys/interfaceAdm -urlMenu=https://192.168.2.1/opengnsys/varios/menubrowser.php +urlMenu={}://{}/opengnsys/varios/menubrowser.php urlMsg=http://localhost/cgi-bin/httpd-log.sh [CloningEngine] -remote=https://192.168.2.1/opengnsys/rest +remote={}://{}/opengnsys/rest log=DEBUG pathinterface=/opt/opengnsys/interfaceAdm -urlMenu=https://192.168.2.1/opengnsys/varios/menubrowser.php +urlMenu={}://{}/opengnsys/varios/menubrowser.php urlMsg=http://localhost/cgi-bin/httpd-log.sh diff --git a/src/opengnsys/modules/server/OpenGnSys/__init__.py b/src/opengnsys/modules/server/OpenGnSys/__init__.py index 259544a..6b63be0 100644 --- a/src/opengnsys/modules/server/OpenGnSys/__init__.py +++ b/src/opengnsys/modules/server/OpenGnSys/__init__.py @@ -115,9 +115,18 @@ class OpenGnSysWorker(ServerWorker): t = 0 # Count of time # Generate random secret to send on activation self.random = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(self.length)) + # Ensure cfg has required configuration variables or an exception will be thrown + ogcore_scheme = os.environ['OGAGENTCFG_OGCORE_SCHEME'] or 'https' + ogcore_ip = os.environ['OGAGENTCFG_OGCORE_IP'] or '192.168.2.1' + ogcore_port = os.environ['OGAGENTCFG_OGCORE_PORT'] or '8443' + ogcore_altip = os.environ['OGAGENTCFG_OGCORE_ALTIP'] or '192.168.2.254' + ogcore_altport = os.environ['OGAGENTCFG_OGCORE_ALTPORT'] or '8443' + ogcore_ip_port = ':'.join (map (str, filter (None, [ogcore_ip, ogcore_port ]))) + ogcore_altip_port = ':'.join (map (str, filter (None, [ogcore_altip, ogcore_altport ]))) try: url = self.service.config.get(self.name, 'remote') + url = url.format (ogcore_scheme, ogcore_ip_port) except NoOptionError as e: logger.error("Configuration error: {}".format(e)) raise e @@ -161,7 +170,9 @@ class OpenGnSysWorker(ServerWorker): logger.warn (str (e)) # Trying to initialize on alternative server, if defined # (used in "exam mode" from the University of Seville) - self.REST = REST(self.service.config.get(self.name, 'altremote')) + alturl = self.service.config.get(self.name, 'altremote') + alturl = alturl.format (ogcore_scheme, ogcore_altip_port) + self.REST = REST(alturl) self.REST.sendMessage('ogagent/started', {'mac': self.interface.mac, 'ip': self.interface.ip, 'secret': self.random, 'ostype': operations.os_type, 'osversion': operations.os_version, 'alt_url': True, diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 0634a22..1800a24 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -142,7 +142,7 @@ class ogLiveWorker(ServerWorker): def notifier (self, job_id, result): logger.debug (f'notifier() called, job_id ({job_id}) result ({result})') result['job_id'] = job_id - res = self.REST.sendMessage ('clients/status/webhook', result) + self.REST.sendMessage ('clients/status/webhook', result) def killer (self, job_id): logger.debug (f'killer() called, job_id ({job_id})') @@ -366,12 +366,23 @@ class ogLiveWorker(ServerWorker): self.idaula = None ## Identificador del aula self.q = None ## for passing PIDs around + ogcore_scheme = os.environ['OGAGENTCFG_OGCORE_SCHEME'] or 'https' + ogcore_ip = os.environ['OGAGENTCFG_OGCORE_IP'] or '192.168.2.1' + ogcore_port = os.environ['OGAGENTCFG_OGCORE_PORT'] or '8443' + urlmenu_scheme = os.environ['OGAGENTCFG_URLMENU_SCHEME'] or 'https' + urlmenu_ip = os.environ['OGAGENTCFG_URLMENU_IP'] or '192.168.2.1' + urlmenu_port = os.environ['OGAGENTCFG_URLMENU_PORT'] or '8443' + ogcore_ip_port = ':'.join (map (str, filter (None, [ogcore_ip, ogcore_port ]))) + urlmenu_ip_port = ':'.join (map (str, filter (None, [urlmenu_ip, urlmenu_port]))) try: url = self.service.config.get (self.name, 'remote') loglevel = self.service.config.get (self.name, 'log') self.pathinterface = self.service.config.get (self.name, 'pathinterface') self.urlMenu = self.service.config.get (self.name, 'urlMenu') self.urlMsg = self.service.config.get (self.name, 'urlMsg') + + url = url.format (ogcore_scheme, ogcore_ip_port) + self.urlMenu = self.urlMenu.format (urlmenu_scheme, urlmenu_ip_port) except NoOptionError as e: logger.error ("Configuration error: {}".format (e)) raise e -- 2.40.1 From 2f4ade71ddcf57b73710e60fa5785fdf4b5d8266 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Mon, 18 Nov 2024 09:36:29 +0100 Subject: [PATCH 10/19] refs #1112 avoid KeyErrors --- src/opengnsys/modules/server/OpenGnSys/__init__.py | 10 +++++----- src/opengnsys/workers/oglive_worker.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/opengnsys/modules/server/OpenGnSys/__init__.py b/src/opengnsys/modules/server/OpenGnSys/__init__.py index 6b63be0..b614cb2 100644 --- a/src/opengnsys/modules/server/OpenGnSys/__init__.py +++ b/src/opengnsys/modules/server/OpenGnSys/__init__.py @@ -117,11 +117,11 @@ class OpenGnSysWorker(ServerWorker): self.random = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(self.length)) # Ensure cfg has required configuration variables or an exception will be thrown - ogcore_scheme = os.environ['OGAGENTCFG_OGCORE_SCHEME'] or 'https' - ogcore_ip = os.environ['OGAGENTCFG_OGCORE_IP'] or '192.168.2.1' - ogcore_port = os.environ['OGAGENTCFG_OGCORE_PORT'] or '8443' - ogcore_altip = os.environ['OGAGENTCFG_OGCORE_ALTIP'] or '192.168.2.254' - ogcore_altport = os.environ['OGAGENTCFG_OGCORE_ALTPORT'] or '8443' + ogcore_scheme = os.environ.get ('OGAGENTCFG_OGCORE_SCHEME', 'https') + ogcore_ip = os.environ.get ('OGAGENTCFG_OGCORE_IP', '192.168.2.1') + ogcore_port = os.environ.get ('OGAGENTCFG_OGCORE_PORT', '8443') + ogcore_altip = os.environ.get ('OGAGENTCFG_OGCORE_ALTIP', '192.168.2.254') + ogcore_altport = os.environ.get ('OGAGENTCFG_OGCORE_ALTPORT', '8443') ogcore_ip_port = ':'.join (map (str, filter (None, [ogcore_ip, ogcore_port ]))) ogcore_altip_port = ':'.join (map (str, filter (None, [ogcore_altip, ogcore_altport ]))) try: diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 1800a24..01dd0c4 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -366,12 +366,12 @@ class ogLiveWorker(ServerWorker): self.idaula = None ## Identificador del aula self.q = None ## for passing PIDs around - ogcore_scheme = os.environ['OGAGENTCFG_OGCORE_SCHEME'] or 'https' - ogcore_ip = os.environ['OGAGENTCFG_OGCORE_IP'] or '192.168.2.1' - ogcore_port = os.environ['OGAGENTCFG_OGCORE_PORT'] or '8443' - urlmenu_scheme = os.environ['OGAGENTCFG_URLMENU_SCHEME'] or 'https' - urlmenu_ip = os.environ['OGAGENTCFG_URLMENU_IP'] or '192.168.2.1' - urlmenu_port = os.environ['OGAGENTCFG_URLMENU_PORT'] or '8443' + ogcore_scheme = os.environ.get ('OGAGENTCFG_OGCORE_SCHEME', 'https') + ogcore_ip = os.environ.get ('OGAGENTCFG_OGCORE_IP', '192.168.2.1') + ogcore_port = os.environ.get ('OGAGENTCFG_OGCORE_PORT', '8443') + urlmenu_scheme = os.environ.get ('OGAGENTCFG_URLMENU_SCHEME', 'https') + urlmenu_ip = os.environ.get ('OGAGENTCFG_URLMENU_IP', '192.168.2.1') + urlmenu_port = os.environ.get ('OGAGENTCFG_URLMENU_PORT', '8443') ogcore_ip_port = ':'.join (map (str, filter (None, [ogcore_ip, ogcore_port ]))) urlmenu_ip_port = ':'.join (map (str, filter (None, [urlmenu_ip, urlmenu_port]))) try: -- 2.40.1 From 831da3a053f161f15fd15166f30eaff8a4ffe0e5 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Mon, 18 Nov 2024 12:15:10 +0100 Subject: [PATCH 11/19] refs #1112 avoid KeyErrors --- linux/debian/changelog | 6 ++++++ src/VERSION | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 7a44674..583fea3 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre5-1) stable; urgency=medium + + * Avoid some KeyErrors + + -- OpenGnsys developers Mon, 18 Nov 2024 12:14:27 +0100 + ogagent (1.4.5~pre4-1) stable; urgency=medium * Don't die when ogcore returns HTTP 4xx or 5xx diff --git a/src/VERSION b/src/VERSION index e2a171d..762fdb3 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre4 +1.4.5-pre5 -- 2.40.1 From 7293aee3eaf263ed7fa3d2d340cfcec52e062e02 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 20 Nov 2024 13:44:57 +0100 Subject: [PATCH 12/19] refs #1112 do not use envvars for the operating-system module --- src/cfg/ogagent.cfg | 4 ++-- .../modules/server/CloningEngine/__init__.py | 5 ++++- src/opengnsys/modules/server/OpenGnSys/__init__.py | 13 +------------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/cfg/ogagent.cfg b/src/cfg/ogagent.cfg index 2d53f6c..02ca837 100644 --- a/src/cfg/ogagent.cfg +++ b/src/cfg/ogagent.cfg @@ -7,9 +7,9 @@ port=8000 #path=test_modules/server,more_modules/server # Remote OpenGnsys Service -remote={}://{}/opengnsys/rest +remote=https://192.168.2.1/opengnsys/rest # Alternate OpenGnsys Service (comment out to enable this option) -#altremote={}://{}/opengnsys/rest +#altremote=https://10.0.2.2/opengnsys/rest # Execution level (permitted operations): status, halt, full level=full diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index 8eed397..ceb1d20 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -146,7 +146,10 @@ class CloningEngineWorker (ogLiveWorker): self.muestraMensaje (3) try: - self.interfaceAdmin (nfn, [dsk, par, nci, ipr, ptc]) + ## the ptc.split() is useless right now, since interfaceAdmin() does ' '.join(params) in order to spawn a shell + ## however we're going to need it in the future (when everything gets translated into python), plus it's harmless now. So let's do it + #self.interfaceAdmin (nfn, [dsk, par, nci, ipr, ptc]) + self.interfaceAdmin (nfn, [dsk, par, nci, ipr] + ptc.split()) self.muestraMensaje (11) herror = 0 except: diff --git a/src/opengnsys/modules/server/OpenGnSys/__init__.py b/src/opengnsys/modules/server/OpenGnSys/__init__.py index b614cb2..259544a 100644 --- a/src/opengnsys/modules/server/OpenGnSys/__init__.py +++ b/src/opengnsys/modules/server/OpenGnSys/__init__.py @@ -115,18 +115,9 @@ class OpenGnSysWorker(ServerWorker): t = 0 # Count of time # Generate random secret to send on activation self.random = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(self.length)) - # Ensure cfg has required configuration variables or an exception will be thrown - ogcore_scheme = os.environ.get ('OGAGENTCFG_OGCORE_SCHEME', 'https') - ogcore_ip = os.environ.get ('OGAGENTCFG_OGCORE_IP', '192.168.2.1') - ogcore_port = os.environ.get ('OGAGENTCFG_OGCORE_PORT', '8443') - ogcore_altip = os.environ.get ('OGAGENTCFG_OGCORE_ALTIP', '192.168.2.254') - ogcore_altport = os.environ.get ('OGAGENTCFG_OGCORE_ALTPORT', '8443') - ogcore_ip_port = ':'.join (map (str, filter (None, [ogcore_ip, ogcore_port ]))) - ogcore_altip_port = ':'.join (map (str, filter (None, [ogcore_altip, ogcore_altport ]))) try: url = self.service.config.get(self.name, 'remote') - url = url.format (ogcore_scheme, ogcore_ip_port) except NoOptionError as e: logger.error("Configuration error: {}".format(e)) raise e @@ -170,9 +161,7 @@ class OpenGnSysWorker(ServerWorker): logger.warn (str (e)) # Trying to initialize on alternative server, if defined # (used in "exam mode" from the University of Seville) - alturl = self.service.config.get(self.name, 'altremote') - alturl = alturl.format (ogcore_scheme, ogcore_altip_port) - self.REST = REST(alturl) + self.REST = REST(self.service.config.get(self.name, 'altremote')) self.REST.sendMessage('ogagent/started', {'mac': self.interface.mac, 'ip': self.interface.ip, 'secret': self.random, 'ostype': operations.os_type, 'osversion': operations.os_version, 'alt_url': True, -- 2.40.1 From a00fbcb76e293e0faecd117e5d4705299a47cb11 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 20 Nov 2024 13:46:12 +0100 Subject: [PATCH 13/19] refs #1112 do not use envvars for the operating-system module --- linux/debian/changelog | 6 ++++++ src/VERSION | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 583fea3..21abbf0 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre6-1) stable; urgency=medium + + * Do not use envvars for the operating-system module + + -- OpenGnsys developers Wed, 20 Nov 2024 13:45:21 +0100 + ogagent (1.4.5~pre5-1) stable; urgency=medium * Avoid some KeyErrors diff --git a/src/VERSION b/src/VERSION index 762fdb3..95708d6 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre5 +1.4.5-pre6 -- 2.40.1 From 74a693750116cc151a06b40c424f37b8588bd177 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 20 Nov 2024 14:25:14 +0100 Subject: [PATCH 14/19] refs #1112 use old browser again --- linux/debian/changelog | 6 ++++++ src/VERSION | 2 +- src/opengnsys/workers/oglive_worker.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 21abbf0..32aa577 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre7-1) stable; urgency=medium + + * Use old browser again + + -- OpenGnsys developers Wed, 20 Nov 2024 14:24:44 +0100 + ogagent (1.4.5~pre6-1) stable; urgency=medium * Do not use envvars for the operating-system module diff --git a/src/VERSION b/src/VERSION index 95708d6..a70e11e 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre6 +1.4.5-pre7 diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 01dd0c4..b0e470c 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -304,9 +304,9 @@ class ogLiveWorker(ServerWorker): def cargaPaginaWeb (self, url=None): if (not url): url = self.urlMenu - os.system ('pkill -9 OGBrowser') + os.system ('pkill -9 browser') - p = subprocess.Popen (['/usr/bin/OGBrowser', '-qws', url]) + p = subprocess.Popen (['/usr/bin/browser', '-qws', url]) try: p.wait (2) ## if the process dies before 2 seconds... logger.error ('Error al ejecutar la llamada a la interface de administración') -- 2.40.1 From 7f45c3083d8580f9fba083389c6bdf51921bb3e0 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Wed, 27 Nov 2024 20:03:58 +0100 Subject: [PATCH 15/19] refs #1112 implement Configurar() --- linux/debian/changelog | 6 +++++ src/VERSION | 2 +- .../modules/server/CloningEngine/__init__.py | 22 +++++++++++++++++-- src/opengnsys/workers/oglive_worker.py | 6 +++-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 32aa577..1030976 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5~pre8-1) stable; urgency=medium + + * Add Configurar() to the CloningEngine module + + -- OpenGnsys developers Wed, 27 Nov 2024 20:02:42 +0100 + ogagent (1.4.5~pre7-1) stable; urgency=medium * Use old browser again diff --git a/src/VERSION b/src/VERSION index a70e11e..d051a1b 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre7 +1.4.5-pre8 diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index ceb1d20..957598d 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -190,13 +190,31 @@ class CloningEngineWorker (ogLiveWorker): nfn = post_params['nfn'] dsk = post_params['dsk'] - cfg = post_params['cfg'].replace('\n','$').replace('\t','#') + cfg = post_params['cfg'] ids = post_params['ids'] self.muestraMensaje (4) + params = [] + disk_info = cfg.pop (0) + logger.debug (f'disk_info ({disk_info})') + for k in ['dis', 'che', 'tch']: + params.append (f'{k}={disk_info[k]}') + disk_info_str = '*'.join (params) + + partitions = [] + for entry in cfg: + logger.debug (f'entry ({entry})') + params = [] + for k in ['par', 'cpt', 'sfi', 'tam', 'ope']: + params.append (f'{k}={entry[k]}') + partitions.append ('*'.join (params)) + part_info_str = '%'.join (partitions) + + cfg_str = f'{disk_info_str}!{part_info_str}%' + try: - self.interfaceAdmin (nfn, [dsk, cfg]) + self.interfaceAdmin (nfn, ['ignored', cfg_str]) self.muestraMensaje (14) herror = 0 except: diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index b0e470c..d7da9c1 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -225,8 +225,9 @@ class ogLiveWorker(ServerWorker): p = subprocess.Popen (proc, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if self.q: self.q.put (p.pid) - else: - logger.debug ('oops, queremos escribir el PID del hijo a la cola pero no hay cola') + #else: + # ## sale este mensaje en el log, y no se por que + # logger.debug ('oops, queremos escribir el PID del hijo a la cola pero no hay cola') sout = serr = '' while p.poll() is None: for l in iter (p.stdout.readline, b''): sout += l.decode ('utf-8', 'ignore') @@ -435,6 +436,7 @@ class ogLiveWorker(ServerWorker): ## ## está sin probar. Simplemente probé que el agente arranca (o sea, que no lo rompí con estos cambios) ## versión 1.4.5-pre3, desplegada en entornos de desarrollo y funciona bien +## (aunque sale el mensaje de "oops, queremos escribir el PID del hijo a la cola pero no hay cola", no sé por qué. Lo comento para que no despiste a la gente) ## la idea sería mandarle un EjecutarScript 'sleep 30' y luego un KillJob ## ## $ curl --insecure -X POST --data '{"nfn":"EjecutarScript","scp":"cd /usr; sleep 30; pwd; ls","ids":"0"}' https://192.168.2.199:8000/ogAdmClient/EjecutarScript -- 2.40.1 From be1fd7d624efaafbb7d11bf2f5efeb3c6ce53f14 Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Thu, 28 Nov 2024 10:18:34 +0100 Subject: [PATCH 16/19] refs #1112 fix bug while accessing object member --- src/opengnsys/workers/oglive_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index d7da9c1..5eaae01 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -190,8 +190,8 @@ class ogLiveWorker(ServerWorker): logger.debug (f'considering thread ({k})') if self.q: - if not q.empty(): - elem['child_pid'] = q.get() + if not self.q.empty(): + elem['child_pid'] = self.q.get() logger.debug (f'queue not empty, got pid ({elem["child_pid"]})') else: logger.debug (f'queue empty') @@ -413,7 +413,7 @@ class ogLiveWorker(ServerWorker): import queue self.q = queue.Queue() ## a single queue works for us because we never have more than one long_running_job at the same time self.thread_list[job_id] = { - 'thread': ThreadWithResult (target=f, args=args), + 'thread': ThreadWithResult (target=f, args=args), ## tengo que pasar self.q aqui dentro de args? 'starttime': time.time(), 'child_pid': None, 'running': True, -- 2.40.1 From 69be238f9fd45e9d0a8305ea0241d08ac2e28cfd Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Fri, 29 Nov 2024 10:24:15 +0100 Subject: [PATCH 17/19] refs #1108 kill subprocesses in oglive --- linux/debian/changelog | 8 ++- src/VERSION | 2 +- .../modules/server/CloningEngine/__init__.py | 3 + .../modules/server/ogAdmClient/__init__.py | 3 + src/opengnsys/workers/oglive_worker.py | 67 ++++++++++--------- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index 1030976..b84d33d 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.5-1) stable; urgency=medium + + * Kill long running jobs in oglive + + -- OpenGnsys developers Fri, 29 Nov 2024 10:22:36 +0100 + ogagent (1.4.5~pre8-1) stable; urgency=medium * Add Configurar() to the CloningEngine module @@ -31,7 +37,7 @@ ogagent (1.4.5~pre4-1) stable; urgency=medium ogagent (1.4.5~pre3-1) stable; urgency=medium - * Kill long running jobs in oglive + * Kill long running jobs in oglive (not-yet-working draft) -- OpenGnsys developers Wed, 06 Nov 2024 14:11:32 +0100 diff --git a/src/VERSION b/src/VERSION index d051a1b..e516bb9 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5-pre8 +1.4.5 diff --git a/src/opengnsys/modules/server/CloningEngine/__init__.py b/src/opengnsys/modules/server/CloningEngine/__init__.py index 957598d..67ce077 100644 --- a/src/opengnsys/modules/server/CloningEngine/__init__.py +++ b/src/opengnsys/modules/server/CloningEngine/__init__.py @@ -349,4 +349,7 @@ class CloningEngineWorker (ogLiveWorker): logger.debug ('in process_KillJob, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) jid = post_params['job_id'] r = self.killer (jid) + logger.debug (f'r bef ({r})') + r.update ({ 'nfn':'RESPUESTA_KillJob', 'job':jid }) + logger.debug (f'r aft ({r})') return r diff --git a/src/opengnsys/modules/server/ogAdmClient/__init__.py b/src/opengnsys/modules/server/ogAdmClient/__init__.py index f08fc16..1cf33a5 100644 --- a/src/opengnsys/modules/server/ogAdmClient/__init__.py +++ b/src/opengnsys/modules/server/ogAdmClient/__init__.py @@ -567,4 +567,7 @@ class ogAdmClientWorker (ogLiveWorker): logger.debug ('in process_KillJob, path "{}" get_params "{}" post_params "{}" server "{}"'.format (path, get_params, post_params, server)) jid = post_params['job_id'] r = self.killer (jid) + logger.debug (f'r bef ({r})') + r.update ({ 'nfn':'RESPUESTA_KillJob', 'job':jid }) + logger.debug (f'r aft ({r})') return r diff --git a/src/opengnsys/workers/oglive_worker.py b/src/opengnsys/workers/oglive_worker.py index 5eaae01..faaeaf5 100644 --- a/src/opengnsys/workers/oglive_worker.py +++ b/src/opengnsys/workers/oglive_worker.py @@ -48,13 +48,16 @@ class ThreadWithResult (threading.Thread): try: self.result = None if self._target is not None: + ## the first arg in self._args is the queue + self.q = self._args[0] + self._args = self._args[1:] try: self.result = self._target (*self._args, **self._kwargs) except Exception as e: self.result = { 'res': 2, 'der': f'got exception: ({e})' } ## res=2 as defined in ogAdmClient.c:2048 finally: # Avoid a refcycle if the thread is running a function with an argument that has a member that points to the thread. - del self._target, self._args, self._kwargs + del self._target, self._args, self._kwargs, self.q class ogLiveWorker(ServerWorker): thread_list = {} @@ -150,36 +153,45 @@ class ogLiveWorker(ServerWorker): with self.thread_lock: if 'thread' not in self.thread_list[job_id]: return { 'res': 2, 'der': 'Job is not running' } - t = self.thread_list[job_id]['thread'] + t = self.thread_list[job_id]['thread'] pid = self.thread_list[job_id]['child_pid'] logger.debug (f'pid ({pid})') try_times = 8 sig = signal.SIGTERM + msg = f'could not kill pid ({pid}) after ({try_times}) tries' + success = 2 ## mimic cmd['res'] in respuestaEjecucionComando(): "1" means success, "2" means failed while True: t.join (0.05) if not t.is_alive(): - logger.debug (f'thread exited, yay!') - ## limpieza - self.q = None + msg = 'job terminated' + success = 1 + logger.debug (msg) self.thread_list[job_id]['child_pid'] = None break + ## race condition: if the subprocess finishes just here, then we already checked that t.is_alive() is true, but os.path.exists(/proc/pid) will be false below. msg will be 'nothing to kill'. + ## this is fine in the first iteration of the loop, before we send any signals. In the rest of iterations, after some signals were sent, msg should be 'job terminated' instead. if pid: if os.path.exists (f'/proc/{pid}'): - logger.debug (f'would os.kill pid ({pid})') - ## si el proceso se muere justo aquí, no pasa nada: la señal va al vacío - #os.kill (pid, sig) + logger.debug (f'sending signal ({sig}) to pid ({pid})') + ## if the process finishes just here, nothing happens: the signal is sent to the void + os.kill (pid, sig) + #subprocess.run (['kill', '--signal', str(sig), str(pid)]) else: - logger.debug (f'pid ({pid}) is gone, nothing to kill...') + msg = f'pid ({pid}) is gone, nothing to kill' + success = 1 + logger.debug (msg) self.thread_list[job_id]['child_pid'] = None + break else: - logger.debug (f'oops no tenemos a quien matar') + msg = 'no PID to kill' + logger.debug (msg) + if not try_times: break if 4 == try_times: sig = signal.SIGKILL ## change signal after a few tries try_times -= 1 time.sleep (0.4) - ## y si no lo hemos conseguido, qué?? - return {'job_id':job_id} + return { 'res':success, 'der':msg } def mon (self): while True: @@ -221,13 +233,12 @@ class ogLiveWorker(ServerWorker): proc = ['bash', '-c', '{} {}'.format (devel_bash_prefix, exe)] logger.debug ('subprocess.run ("{}", capture_output=True)'.format (proc)) - #p = subprocess.run (proc, capture_output=True) p = subprocess.Popen (proc, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if self.q: self.q.put (p.pid) - #else: - # ## sale este mensaje en el log, y no se por que - # logger.debug ('oops, queremos escribir el PID del hijo a la cola pero no hay cola') + else: + ## esto sucede por ejemplo cuando arranca el agente, que estamos en interfaceAdmin() en el mismo hilo, sin _long_running_job ni hilo separado + logger.debug ('no queue--not writing any PID to it') sout = serr = '' while p.poll() is None: for l in iter (p.stdout.readline, b''): sout += l.decode ('utf-8', 'ignore') @@ -299,7 +310,7 @@ class ogLiveWorker(ServerWorker): cmd['der'] = '' else: ## el comando tuvo algún error cmd['res'] = 2 - cmd['der'] = self.tbErroresScripts[herror] ## XXX + cmd['der'] = self.tbErroresScripts[herror] return cmd @@ -413,7 +424,7 @@ class ogLiveWorker(ServerWorker): import queue self.q = queue.Queue() ## a single queue works for us because we never have more than one long_running_job at the same time self.thread_list[job_id] = { - 'thread': ThreadWithResult (target=f, args=args), ## tengo que pasar self.q aqui dentro de args? + 'thread': ThreadWithResult (target=f, args=(self.q,) + args), 'starttime': time.time(), 'child_pid': None, 'running': True, @@ -426,27 +437,17 @@ class ogLiveWorker(ServerWorker): ## - aqui en _long_running_job meto una cola en self.q ## - (self.q fue inicializado a None al instanciar el objeto, para evitar error "objeto no tiene 'q'") ## - en el thread_list también tengo un child_pid para almacenar el pid de los procesos hijos -## - en interfaceAdm() al hacer subprocess.Popen(), recojo el pid y lo escribo en la queue +## - al crear el ThreadWithResult le paso la cola, y luego en run() la recojo y la meto en el self.q del thread +## - en interfaceAdmin() al hacer subprocess.Popen(), recojo el pid y lo escribo en la queue ## - en mon() recojo pids de la queue y los meto en thread_list 'child_pid' -## - algunas funciones llaman a interfaceAdm más de una vez, y escriben más de un pid en la cola, y en mon() voy recogiendo y actualizando -## - por ejemplo EjecutarScript llama a interfaceAdm() y luego llama a LeeConfiguracion() el cual llama a interfaceAdm() otra vez +## - algunas funciones llaman a interfaceAdmin más de una vez, y escriben más de un pid en la cola, y en mon() voy recogiendo y actualizando +## - por ejemplo EjecutarScript llama a interfaceAdmin() y luego llama a LeeConfiguracion() el cual llama a interfaceAdmin() otra vez ## - y cuando nos llamen a KillJob, terminamos en killer() el cual coge el 'child_pid' y zas ## - pero a lo mejor el child ya terminó ## - o a lo mejor el KillJob nos llegó demasiado pronto y todavía no hubo ningún child ## -## está sin probar. Simplemente probé que el agente arranca (o sea, que no lo rompí con estos cambios) -## versión 1.4.5-pre3, desplegada en entornos de desarrollo y funciona bien -## (aunque sale el mensaje de "oops, queremos escribir el PID del hijo a la cola pero no hay cola", no sé por qué. Lo comento para que no despiste a la gente) -## la idea sería mandarle un EjecutarScript 'sleep 30' y luego un KillJob -## ## $ curl --insecure -X POST --data '{"nfn":"EjecutarScript","scp":"cd /usr; sleep 30; pwd; ls","ids":"0"}' https://192.168.2.199:8000/ogAdmClient/EjecutarScript ## {"job_id": "EjecutarScript-333feb3f"} ## $ curl --insecure -X POST --data '{"job_id":"EjecutarScript-333feb3f"}' https://192.168.2.199:8000/CloningEngine/KillJob ## -## el KillJob de primeras no va a hacer nada (la llamada a os.kill() está comentada) -## entonces sería probar primero que el flujo va como espero: -## - que primero salga "would kill pid" en el log -## - y pasados unos segundos, al llamar a KillJob otra vez, salga "pid is gone" -## o en otra prueba, metiéndole un time.sleep() a piñón en interfaceAdm() antes de lanzar un hijo -## - que salga "oops no tenemos a quien matar" en el log -## y ya con esto comprobado, descomentar el os.kill() y hacer pruebas reales +## funciona bien, excepto que el PID no muere xD, ni siquiera haciendo subprocess.run('kill') -- 2.40.1 From 173379f99a34f2345fbf61234a0f31de88af3f3e Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Tue, 14 Jan 2025 11:59:15 +0100 Subject: [PATCH 18/19] refs #1338 change menubrowser URL --- src/cfg/ogagent.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cfg/ogagent.cfg b/src/cfg/ogagent.cfg index 02ca837..c07e32a 100644 --- a/src/cfg/ogagent.cfg +++ b/src/cfg/ogagent.cfg @@ -26,12 +26,12 @@ log=DEBUG remote={}://{}/opengnsys/rest log=DEBUG pathinterface=/opt/opengnsys/interfaceAdm -urlMenu={}://{}/opengnsys/varios/menubrowser.php +urlMenu={}://{}/menu-browser urlMsg=http://localhost/cgi-bin/httpd-log.sh [CloningEngine] remote={}://{}/opengnsys/rest log=DEBUG pathinterface=/opt/opengnsys/interfaceAdm -urlMenu={}://{}/opengnsys/varios/menubrowser.php +urlMenu={}://{}/menu-browser urlMsg=http://localhost/cgi-bin/httpd-log.sh -- 2.40.1 From 14e893a21e6e0728e6d2de006dffd43d6568290e Mon Sep 17 00:00:00 2001 From: Natalia Serrano Date: Tue, 14 Jan 2025 12:01:36 +0100 Subject: [PATCH 19/19] refs #1338 bump version --- linux/debian/changelog | 6 ++++++ src/VERSION | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/linux/debian/changelog b/linux/debian/changelog index b84d33d..5360d8e 100644 --- a/linux/debian/changelog +++ b/linux/debian/changelog @@ -1,3 +1,9 @@ +ogagent (1.4.6-1) UNRELEASED; urgency=medium + + * Point to the new menu browser + + -- OpenGnsys developers Tue, 14 Jan 2025 12:00:24 +0100 + ogagent (1.4.5-1) stable; urgency=medium * Kill long running jobs in oglive diff --git a/src/VERSION b/src/VERSION index e516bb9..c514bd8 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -1.4.5 +1.4.6 -- 2.40.1