Recovery improvements

master
Ken Subratie 2023-11-06 18:38:38 -05:00
parent dd5ca986ef
commit 23684083e0
14 changed files with 208 additions and 223 deletions

View File

@ -63,7 +63,6 @@ __all__ = [
"GENEVE_SETUP_TIMEOUT", "GENEVE_SETUP_TIMEOUT",
"MIN_SUCCESSORS", "MIN_SUCCESSORS",
"MAX_ON_DEMAND_EDGES", "MAX_ON_DEMAND_EDGES",
"PEER_DISCOVERY_COALESCE",
"EXCLUSION_BASE_INTERVAL", "EXCLUSION_BASE_INTERVAL",
"MAX_SUCCESSIVE_FAILS", "MAX_SUCCESSIVE_FAILS",
"TRIM_CHECK_INTERVAL", "TRIM_CHECK_INTERVAL",
@ -121,7 +120,6 @@ SDN_CONTROLLER_PORT: Literal[6633] = 6633
GENEVE_SETUP_TIMEOUT: Literal[180] = 180 GENEVE_SETUP_TIMEOUT: Literal[180] = 180
MIN_SUCCESSORS: Literal[2] = 2 MIN_SUCCESSORS: Literal[2] = 2
MAX_ON_DEMAND_EDGES: Literal[3] = 3 MAX_ON_DEMAND_EDGES: Literal[3] = 3
PEER_DISCOVERY_COALESCE: Literal[1] = 1
EXCLUSION_BASE_INTERVAL: Literal[60] = 60 EXCLUSION_BASE_INTERVAL: Literal[60] = 60
MAX_SUCCESSIVE_FAILS: Literal[4] = 4 MAX_SUCCESSIVE_FAILS: Literal[4] = 4
TRIM_CHECK_INTERVAL: Literal[300] = 300 TRIM_CHECK_INTERVAL: Literal[300] = 300

View File

@ -205,8 +205,6 @@ class Broker:
self._que_listener.start() self._que_listener.start()
for k, v in self.cfg_controllers.items(): for k, v in self.cfg_controllers.items():
ctr_lgl = self._config["Broker"].get("LogLevel", LOG_LEVEL) ctr_lgl = self._config["Broker"].get("LogLevel", LOG_LEVEL)
if "LogLevel" in self._controller_config(k):
ctr_lgl = self._controller_config(k)["LogLevel"]
self._setup_controller_logger((k, v["Module"]), formatter, ctr_lgl) self._setup_controller_logger((k, v["Module"]), formatter, ctr_lgl)
def _setup_controller_logger( def _setup_controller_logger(
@ -250,8 +248,8 @@ class Broker:
for ctrl_name in self._load_order: for ctrl_name in self._load_order:
self.load_module(ctrl_name) self.load_module(ctrl_name)
self._ipc.start()
self._timers.start() self._timers.start()
self._ipc.start()
# intialize the the CMs via their respective nexus # intialize the the CMs via their respective nexus
for ctrl_name in self._load_order: for ctrl_name in self._load_order:
self._nexus_map[ctrl_name].initialize() self._nexus_map[ctrl_name].initialize()
@ -368,8 +366,6 @@ class Broker:
def terminate(self): def terminate(self):
with self._nexus_lock: with self._nexus_lock:
self._timers.terminate()
self._ipc.terminate()
for ctrl_name in reversed(self._load_order): for ctrl_name in reversed(self._load_order):
wn = self._nexus_map[ctrl_name]._cm_thread.name wn = self._nexus_map[ctrl_name]._cm_thread.name
self._nexus_map[ctrl_name].work_queue.put(None) self._nexus_map[ctrl_name].work_queue.put(None)
@ -377,6 +373,8 @@ class Broker:
wn = self._nexus_map[ctrl_name]._cm_thread.name wn = self._nexus_map[ctrl_name]._cm_thread.name
self._nexus_map[ctrl_name]._cm_thread.join() self._nexus_map[ctrl_name]._cm_thread.join()
self.logger.info("%s exited", wn) self.logger.info("%s exited", wn)
self._ipc.terminate()
self._timers.terminate()
for ql in self._cm_qlisteners: for ql in self._cm_qlisteners:
ql.stop() ql.stop()
self._que_listener.stop() self._que_listener.stop()
@ -505,6 +503,9 @@ class Broker:
def register_timed_transaction(self, entry: Transaction): def register_timed_transaction(self, entry: Transaction):
self._timers.register(entry) self._timers.register(entry)
def register_dpc(self, delay, call, params=()):
    """Forward a deferred procedure call request to the broker's timer service.

    Args:
        delay: value handed to the timer service as the delay.
        call: the callable to schedule.
        params: positional arguments for ``call``; defaults to none.
    """
    timers = self._timers
    timers.register_dpc(delay, call, params)
def dispach_proxy_msg(self, msg: ProxyMsg): def dispach_proxy_msg(self, msg: ProxyMsg):
# task structure # task structure
# dict(Request=dict(Target=CM, Action=None, Params=None), # dict(Request=dict(Target=CM, Action=None, Params=None),

View File

@ -90,9 +90,9 @@ class CBT:
return introspect(self) return introspect(self)
def __itr__(self):
    """Yield ``(name, value)`` pairs for initiator, recipient, status and data.

    NOTE(review): the method is spelled ``__itr__`` rather than ``__iter__``,
    so the iteration protocol (``iter()``, ``dict()``, ``for``) does not
    invoke it; it only runs when called by name. Confirm which was intended.
    """
    for field in ("initiator", "recipient", "status", "data"):
        # Values are read lazily, one per step, exactly when they are yielded.
        yield (field, getattr(self, field))
def update(self, data, status: bool): def update(self, data, status: bool):
@ -116,7 +116,10 @@ class CBT:
self.response: Optional[CBT.Response] = None self.response: Optional[CBT.Response] = None
self.context: dict = {} self.context: dict = {}
for k, v in kwargs.items(): for k, v in kwargs.items():
self.context[k] = v if hasattr(self, k):
self.k = v
else:
self.context[k] = v
self.lifespan = CBT_LIFESPAN self.lifespan = CBT_LIFESPAN
self.time_created: float = 0.0 self.time_created: float = 0.0
self.time_submited: float = 0.0 self.time_submited: float = 0.0

View File

@ -50,9 +50,9 @@ class ControllerModule:
def _setup_logger(self):
    """Create the logger named after the concrete class.

    When the module's config carries a ``LogLevel`` entry, that level is
    applied to the logger; otherwise the logger's level is left untouched.
    """
    module_logger = logging.getLogger(self.__class__.__name__)
    # Publish the logger before touching the config, as the original does.
    self.logger = module_logger
    if "LogLevel" in self.config:
        module_logger.setLevel(self.config["LogLevel"])
@abstractmethod @abstractmethod
def initialize(self): def initialize(self):
@ -260,6 +260,9 @@ class ControllerModule:
raise ValueError(f"Object already marked as completed {obj}") raise ValueError(f"Object already marked as completed {obj}")
self._nexus.register_timed_transaction(obj, is_completed, on_expired, lifespan) self._nexus.register_timed_transaction(obj, is_completed, on_expired, lifespan)
def register_deferred_call(self, delay, call, params=()):
    """Ask this module's nexus to schedule ``call(*params)`` after ``delay``.

    Args:
        delay: delay value passed through to the nexus unchanged.
        call: the callable to schedule.
        params: positional arguments for ``call``; defaults to none.
    """
    nexus = self._nexus
    nexus.register_deferred_call(delay, call, params)
@abstractmethod @abstractmethod
def handle_ipc(self, msg: ProxyMsg): def handle_ipc(self, msg: ProxyMsg):
NotImplemented NotImplemented

View File

@ -23,7 +23,7 @@ import queue
import threading import threading
import time import time
from . import CONTROLLER_TIMER_INTERVAL, EVENT_PERIOD, statement_false from . import CONTROLLER_TIMER_INTERVAL, EVENT_PERIOD
from .cbt import CBT from .cbt import CBT
from .process_proxy import ProxyMsg from .process_proxy import ProxyMsg
from .timed_transactions import Transaction from .timed_transactions import Transaction
@ -44,10 +44,6 @@ class Nexus:
) )
self._timer_loop_cnt: int = 1 self._timer_loop_cnt: int = 1
self._pending_cbts: dict[int, CBT] = {} self._pending_cbts: dict[int, CBT] = {}
self._last_ctlr_update_ts = time.time()
self._ctlr_update = Transaction(
self, statement_false, self.on_timer, self._timer_interval
)
@property @property
def controller(self): def controller(self):
@ -131,7 +127,7 @@ class Nexus:
def start_controller(self):
    """Start the controller's worker thread, then arm its first timer tick.

    The tick is registered with the broker as a deferred call to
    ``self.on_timer`` after ``self._timer_interval``.
    """
    self._cm_thread.start()
    interval, tick = self._timer_interval, self.on_timer
    self._broker.register_dpc(interval, tick)
def __worker(self): def __worker(self):
# get CBT from the local queue and call process_cbt() of the # get CBT from the local queue and call process_cbt() of the
@ -182,11 +178,7 @@ class Nexus:
f"Unexpected CBT state for expired event. {cbt}" f"Unexpected CBT state for expired event. {cbt}"
) )
def _schedule_ctlr_update(self): def on_timer(self):
self._ctlr_update.lifespan = self._timer_interval
self._broker.register_timed_transaction(self._ctlr_update)
def on_timer(self, nexus, time_expired: float):
try: try:
self._controller.log_state() self._controller.log_state()
self._controller.on_timer_event() self._controller.on_timer_event()
@ -194,7 +186,7 @@ class Nexus:
self._controller.logger.warning( self._controller.logger.warning(
"on_timer exception: %s", err, exc_info=True "on_timer exception: %s", err, exc_info=True
) )
self._schedule_ctlr_update() self._broker.register_dpc(self._timer_interval, self.on_timer)
def query_param(self, param_name=""): def query_param(self, param_name=""):
return self._broker.query_param(param_name) return self._broker.query_param(param_name)
@ -235,5 +227,8 @@ class Nexus:
) )
) )
def register_deferred_call(self, delay, call, params):
    """Relay a deferred-call request from the controller to the broker.

    Args:
        delay: delay value passed through to the broker unchanged.
        call: the callable to schedule.
        params: positional arguments for ``call``.
    """
    broker = self._broker
    broker.register_dpc(delay, call, params)
def send_ipc(self, msg: ProxyMsg): def send_ipc(self, msg: ProxyMsg):
self._broker.send_ipc(msg) self._broker.send_ipc(msg)

View File

@ -53,16 +53,14 @@ class ProxyMsg:
def data(self, payload: bytearray): def data(self, payload: bytearray):
self.ts = time.time() self.ts = time.time()
self._json = None self._json = None
if payload is None:
return
self.payload = payload self.payload = payload
def __repr__(self) -> str:
    """Render the message as ``<fileno>:<json>`` using the ``json`` attribute."""
    return "{}:{}".format(self.fileno, self.json)
@property
def json(self):
    """Return the payload decoded as JSON, caching the result in ``_json``.

    The payload is parsed only when nothing is cached yet and the payload is
    truthy; with no cache and an empty or missing payload this returns None.
    """
    cached = self._json
    if cached is not None:
        return cached
    if self.payload:
        cached = json.loads(self.payload.decode("utf-8"))
        self._json = cached
    return cached
@ -124,6 +122,8 @@ class ProcessProxy:
while not self._exit_ev.is_set(): while not self._exit_ev.is_set():
while self.tx_que.qsize() > 0: while self.tx_que.qsize() > 0:
msg: ProxyMsg = self.tx_que.get_nowait() msg: ProxyMsg = self.tx_que.get_nowait()
if not msg.data:
continue
node = connections.get(msg.fileno, None) node = connections.get(msg.fileno, None)
if node is not None: if node is not None:
if not node.event & select.EPOLLOUT: if not node.event & select.EPOLLOUT:
@ -154,6 +154,7 @@ class ProcessProxy:
"Node %s IPC read hangup", node.skt.fileno() "Node %s IPC read hangup", node.skt.fileno()
) )
if not node.tx_deque: if not node.tx_deque:
connections.pop(fileno)
self.close_client(node) self.close_client(node)
elif event & select.EPOLLHUP: elif event & select.EPOLLHUP:
node = connections.pop(fileno) node = connections.pop(fileno)
@ -164,10 +165,12 @@ class ProcessProxy:
bufsz = int.from_bytes( bufsz = int.from_bytes(
connections[fileno].skt.recv(2), sys.byteorder connections[fileno].skt.recv(2), sys.byteorder
) )
if bufsz <= 0: if bufsz == 0:
node = connections[fileno] connections[fileno].skt.recv(0)
node.skt.shutdown(socket.SHUT_WR) self.logger.warning(
elif bufsz > 65507: "Zero byte read buffer size received"
)
elif bufsz < 0 or bufsz > 65507:
node = connections[fileno] node = connections[fileno]
connections.pop(fileno) connections.pop(fileno)
self.close_client(node) self.close_client(node)

View File

@ -31,7 +31,12 @@ class TimedTransactions:
def register(self, entry: Transaction):
    """Schedule the expiry check for a timed transaction.

    Nothing is scheduled once the exit event is set. Otherwise
    ``self._get_expired(entry)`` is queued on the scheduler using the
    entry's own ``lifespan`` as delay and its ``priority``.
    """
    if not self._exit_ev.is_set():
        delay, prio = entry.lifespan, entry.priority
        self._sched.enter(delay, prio, self._get_expired, (entry,))
def register_dpc(self, delay: float, call, params: tuple = (), priority: int = 15):
    """Schedule a deferred procedure call on the scheduler.

    Args:
        delay: time to wait before the call, in the scheduler's time units.
        call: the callable to invoke.
        params: positional arguments for ``call``; defaults to none so that
            parameterless callbacks need no explicit empty tuple.
        priority: scheduler priority used to order events due at the same
            time; defaults to 15, the value previously hard-coded here.

    The request is silently dropped once the exit event is set.
    """
    if self._exit_ev.is_set():
        return
    self._sched.enter(delay, priority, call, params)
def _get_expired(self, entry): def _get_expired(self, entry):
if not entry.is_completed(): if not entry.is_completed():

View File

@ -27,6 +27,7 @@ except ImportError:
import copy import copy
import logging import logging
import signal
import subprocess import subprocess
import threading import threading
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
@ -49,6 +50,7 @@ from broker.cbt import CBT
from broker.controller_module import ControllerModule from broker.controller_module import ControllerModule
from broker.process_proxy import ProxyMsg from broker.process_proxy import ProxyMsg
from pyroute2 import IPRoute from pyroute2 import IPRoute
from pyroute2.netlink.exceptions import NetlinkError
from .tunnel import TUNNEL_EVENTS from .tunnel import TUNNEL_EVENTS
@ -171,10 +173,12 @@ class OvsBridge(BridgeABC):
def add_port(self, port_name, port_descr): def add_port(self, port_name, port_descr):
self.del_port(port_name) self.del_port(port_name)
self.flush_ip_addresses(self.name)
with IPRoute() as ipr: with IPRoute() as ipr:
idx = ipr.link_lookup(ifname=port_name)[0] idx = ipr.link_lookup(ifname=port_name)
ipr.link("set", index=idx, mtu=self.mtu) if len(idx) < 1:
raise NetlinkError(19, "No such device")
ipr.flush_addr(index=idx[0])
ipr.link("set", index=idx[0], mtu=self.mtu)
broker.run_proc( broker.run_proc(
[OvsBridge.brctl, "--may-exist", "add-port", self.name, port_name] [OvsBridge.brctl, "--may-exist", "add-port", self.name, port_name]
) )
@ -263,9 +267,12 @@ class LinuxBridge(BridgeABC):
def add_port(self, port_name, port_descr): def add_port(self, port_name, port_descr):
with IPRoute() as ipr: with IPRoute() as ipr:
idx = ipr.link_lookup(ifname=port_name)[0] idx = ipr.link_lookup(ifname=port_name)
ipr.link("set", index=idx, mtu=self.mtu) if len(idx) < 1:
ipr.link("set", index=idx, master=ipr.link_lookup(ifname=self.name)[0]) raise NetlinkError(19, "No such device")
ipr.flush_addr(index=idx[0])
ipr.link("set", index=idx[0], mtu=self.mtu)
ipr.link("set", index=idx[0], master=ipr.link_lookup(ifname=self.name)[0])
self.ports.add(port_name) self.ports.add(port_name)
self.port_descriptors[port_name] = port_descr self.port_descriptors[port_name] = port_descr
@ -307,16 +314,20 @@ class VNIC(BridgeABC):
def add_port(self, port_name):
    """Adopt ``port_name`` as this VNIC's device and bring it up.

    Sets the MTU, assigns the configured address and prefix length, then
    sets the link state to up.

    Raises:
        NetlinkError: code 19 when no link named ``port_name`` is found.
    """
    self.name = port_name
    with IPRoute() as ipr:
        matches = ipr.link_lookup(ifname=port_name)
        if len(matches) < 1:
            raise NetlinkError(19, "No such device")
        dev_index = matches[0]
        ipr.link("set", index=dev_index, mtu=self.mtu)
        ipr.addr("add", index=dev_index, address=self.ip_addr, mask=self.prefix_len)
        ipr.link("set", index=dev_index, state="up")
def del_port(self, port_name):
    """Set the link named ``port_name`` down and delete it.

    Raises:
        NetlinkError: code 19 when no link named ``port_name`` is found.
    """
    with IPRoute() as ipr:
        matches = ipr.link_lookup(ifname=port_name)
        if len(matches) < 1:
            raise NetlinkError(19, "No such device")
        dev_index = matches[0]
        ipr.link("set", index=dev_index, state="down")
        ipr.link("del", index=dev_index)
################################################################################################### ###################################################################################################
@ -447,7 +458,7 @@ class BridgeController(ControllerModule):
net_ovl["NetDevice"] = {} net_ovl["NetDevice"] = {}
# start the BF proxy if at least one overlay is configured for it # start the BF proxy if at least one overlay is configured for it
if "BoundedFlood" in self.config: if "BoundedFlood" in self.config:
self._start_bf_proxy_server() self._start_boundedflood()
# create and configure the bridge for each overlay # create and configure the bridge for each overlay
_ = self._create_overlay_bridges() _ = self._create_overlay_bridges()
publishers = self.get_registered_publishers() publishers = self.get_registered_publishers()
@ -490,7 +501,7 @@ class BridgeController(ControllerModule):
"TOP_REQUEST_OND_TUNNEL": self.resp_handler_default, "TOP_REQUEST_OND_TUNNEL": self.resp_handler_default,
} }
def _start_bf_proxy_server(self): def _start_boundedflood(self):
bf_config = self.config["BoundedFlood"] bf_config = self.config["BoundedFlood"]
bf_config["NodeId"] = self.node_id bf_config["NodeId"] = self.node_id
bf_ovls = bf_config.pop("Overlays") bf_ovls = bf_config.pop("Overlays")
@ -502,7 +513,7 @@ class BridgeController(ControllerModule):
) )
bf_config[br_name] = bf_ovls[olid] bf_config[br_name] = bf_ovls[olid]
bf_config[br_name]["OverlayId"] = olid bf_config[br_name]["OverlayId"] = olid
self.start_bf_client_module(bf_config) self._start_bf_module(bf_config)
def _create_overlay_bridges(self) -> dict: def _create_overlay_bridges(self) -> dict:
ign_br_names = {} ign_br_names = {}
@ -548,7 +559,7 @@ class BridgeController(ControllerModule):
except Exception as err: except Exception as err:
self._tunnels[overlay_id].pop(port_name, None) self._tunnels[overlay_id].pop(port_name, None)
bridge.del_port(port_name) bridge.del_port(port_name)
self.logger.info("Failed to add port %s. %s", tnl_data, err, exc_info=True) self.logger.info("Failed to add port %s. %s", tnl_data, err)
def req_handler_manage_bridge(self, cbt: CBT): def req_handler_manage_bridge(self, cbt: CBT):
try: try:
@ -558,9 +569,12 @@ class BridgeController(ControllerModule):
if cbt.request.params["UpdateType"] == TUNNEL_EVENTS.Connected: if cbt.request.params["UpdateType"] == TUNNEL_EVENTS.Connected:
self._add_tunnel_port(olid, port_name, cbt.request.params) self._add_tunnel_port(olid, port_name, cbt.request.params)
elif cbt.request.params["UpdateType"] == TUNNEL_EVENTS.Removed: elif cbt.request.params["UpdateType"] == TUNNEL_EVENTS.Removed:
self._tunnels[olid].pop(port_name, None) if port_name:
bridge.del_port(port_name) self._tunnels[olid].pop(port_name, None)
self.logger.info("Port %s removed from bridge %s", port_name, bridge) bridge.del_port(port_name)
self.logger.info(
"Port %s removed from bridge %s", port_name, bridge
)
except Exception as err: except Exception as err:
self.logger.warning("Manage bridge error %s", err, exc_info=True) self.logger.warning("Manage bridge error %s", err, exc_info=True)
cbt.set_response(None, True) cbt.set_response(None, True)
@ -569,10 +583,17 @@ class BridgeController(ControllerModule):
def on_timer_event(self):
    """Periodic housekeeping: trim tunnel tables and check the BF process.

    Every entry in ``self._tunnels`` is trimmed. If a BoundedFlood process
    handle exists and polling it yields a truthy exit code, the event is
    logged and ``_start_boundedflood`` is called again.

    NOTE(review): an exit code of 0 is falsy and therefore does not trigger
    a restart; confirm that a clean exit should be ignored here.
    """
    for tunnel_table in self._tunnels.values():
        tunnel_table.trim()
    bf_proc = self._bf_proc
    if bf_proc is None:
        return
    exit_code = bf_proc.poll()
    if not exit_code:
        # Covers both "still running" (None) and an exit code of 0.
        return
    self.logger.info(
        "BF module terminated unexpectedly (%s), restarting.", exit_code
    )
    self._start_boundedflood()
def terminate(self): def terminate(self):
try: try:
self.stop_bf_module() self._stop_bf_module()
for olid, bridge in self._ovl_net.items(): for olid, bridge in self._ovl_net.items():
if self.overlays[olid]["NetDevice"].get( if self.overlays[olid]["NetDevice"].get(
"AutoDelete", BRIDGE_AUTO_DELETE "AutoDelete", BRIDGE_AUTO_DELETE
@ -671,7 +692,7 @@ class BridgeController(ControllerModule):
msg.data = json.dumps(task).encode("utf-8") msg.data = json.dumps(task).encode("utf-8")
self.send_ipc(msg) self.send_ipc(msg)
def start_bf_client_module(self, bf_config): def _start_bf_module(self, bf_config):
RyuManager = spawn.find_executable("ryu-manager") RyuManager = spawn.find_executable("ryu-manager")
if RyuManager is None: if RyuManager is None:
raise RuntimeError("RyuManager was not found, is it installed?") raise RuntimeError("RyuManager was not found, is it installed?")
@ -687,13 +708,13 @@ class BridgeController(ControllerModule):
] ]
self._bf_proc = subprocess.Popen(cmd) self._bf_proc = subprocess.Popen(cmd)
def stop_bf_module(self, wt: int = 1.15): def _stop_bf_module(self, wt: int = 1.15):
if self._bf_proc is not None: if self._bf_proc is not None:
try: try:
exit_code = self._bf_proc.poll() exit_code = self._bf_proc.poll()
if exit_code is None: if exit_code is None:
self._bf_proc.terminate() self._bf_proc.send_signal(signal.SIGINT)
self._bf_proc.wait() self._bf_proc.wait(wt)
else: else:
self.logger.debug( self.logger.debug(
"BoundedFlood process %s already exited with %s", "BoundedFlood process %s already exited with %s",
@ -707,7 +728,5 @@ class BridgeController(ControllerModule):
"Killing unresponsive BoundedFlood: %s", self._bf_proc.pid "Killing unresponsive BoundedFlood: %s", self._bf_proc.pid
) )
self._bf_proc.kill() self._bf_proc.kill()
self._bf_proc = None
self.logger.info("BoundedFlood terminated") self.logger.info("BoundedFlood terminated")
# Todo: check if BF process exited and restart if not shutting down

View File

@ -444,4 +444,4 @@ class GeneveTunnel(ControllerModule):
if tnl.state == TUNNEL_STATES.AUTHORIZED: if tnl.state == TUNNEL_STATES.AUTHORIZED:
self._deauth_tnl(tnl) self._deauth_tnl(tnl)
else: else:
self._rollback_tnl([tnl]) self._rollback_tnl(tnl)

View File

@ -83,6 +83,9 @@ class Tunnel:
def tunnel_state(self, new_state): def tunnel_state(self, new_state):
self.state = new_state self.state = new_state
def is_tnl_online(self) -> bool:
    """Return True when this tunnel's state equals ``TUNNEL_STATES.ONLINE``."""
    online = self.tunnel_state == TUNNEL_STATES.ONLINE
    return bool(online)
class LinkManager(ControllerModule): class LinkManager(ControllerModule):
TAPNAME_MAXLEN = 15 TAPNAME_MAXLEN = 15
@ -375,13 +378,15 @@ class LinkManager(ControllerModule):
self._tunnels[tnlid].link.status_retry = 0 self._tunnels[tnlid].link.status_retry = 0
elif cbt.request.params["Command"] == "LinkDisconnected": elif cbt.request.params["Command"] == "LinkDisconnected":
if self._tunnels[tnlid].tunnel_state != TUNNEL_STATES.QUERYING: if self._tunnels[tnlid].tunnel_state != TUNNEL_STATES.QUERYING:
self.logger.debug("Link %s is disconnected", tnlid)
# issue a link state check only if it not already being done # issue a link state check only if it not already being done
self.logger.debug("Link %s is disconnected", tnlid)
self._tunnels[tnlid].tunnel_state = TUNNEL_STATES.QUERYING self._tunnels[tnlid].tunnel_state = TUNNEL_STATES.QUERYING
cbt.set_response(data=None, status=True) cbt.set_response(data=None, status=True)
self.register_cbt( self.register_deferred_call(
"TincanTunnel", "TCI_QUERY_LINK_INFO", {"TunnelId": tnlid} 5,
) self.register_cbt,
("TincanTunnel", "TCI_QUERY_LINK_INFO", {"TunnelId": tnlid}),
) # issue link stat check in 5 secs as the link can reconnect
elif cbt.request.params["Command"] == "TincanTunnelFailed": elif cbt.request.params["Command"] == "TincanTunnelFailed":
lnkid = self.link_id(tnlid) lnkid = self.link_id(tnlid)
if lnkid: if lnkid:
@ -736,8 +741,7 @@ class LinkManager(ControllerModule):
return ign_netinf return ign_netinf
def is_tnl_online(self, tnl: Tunnel) -> bool:
    """Report whether ``tnl`` is online by delegating to the tunnel itself."""
    online = tnl.is_tnl_online()
    return online
def _remove_link_from_tunnel(self, tnlid): def _remove_link_from_tunnel(self, tnlid):
tnl = self._tunnels.get(tnlid) tnl = self._tunnels.get(tnlid)
@ -855,7 +859,8 @@ class LinkManager(ControllerModule):
self.complete_cbt(parent_cbt) self.complete_cbt(parent_cbt)
return return
lnkid = self.link_id(tnlid) lnkid = self.link_id(tnlid)
self._tunnels[tnlid].link.creation_state = 0xC0 tnl = self._tunnels[tnlid]
tnl.link.creation_state = 0xC0
self.logger.debug( self.logger.debug(
"Creating link %s to peer %s (5/5 Initiator)", tnlid[:7], peer_id[:7] "Creating link %s to peer %s (5/5 Initiator)", tnlid[:7], peer_id[:7]
) )
@ -868,6 +873,13 @@ class LinkManager(ControllerModule):
self.node_id[:7], self.node_id[:7],
peer_id[:7], peer_id[:7],
) )
if not tnl.is_tnl_online():
self.register_timed_transaction(
tnl,
self.is_tnl_online,
self.on_tnl_timeout,
LINK_SETUP_TIMEOUT,
)
def _complete_link_endpt_request(self, cbt: CBT): def _complete_link_endpt_request(self, cbt: CBT):
# Create Link: Phase 4 Node B # Create Link: Phase 4 Node B
@ -1036,40 +1048,3 @@ class LinkManager(ControllerModule):
self._tunnels.pop(tnl.tnlid, None) self._tunnels.pop(tnl.tnlid, None)
if tnl.link: if tnl.link:
self._links.pop(tnl.link.lnkid, None) self._links.pop(tnl.link.lnkid, None)
""" TODO: OUTDATED, NEED TO BE UPDATED
###################################################################################################
Link Manager state and event specifications
###################################################################################################
If LM fails a CBT there will be no further events fired for the tunnel.
Once tunnel goes online an explicit CBT LNK_REMOVE_TUNNEL is required.
Partially created tunnels that fails will be removed automatically by LM.
Events
(1) TunnelEvents.Authorized - After a successful completion of CBT LNK_AUTH_TUNNEL, the tunnel
descriptor is created and TunnelEvents.Authorized is fired.
(2) TunnelEvents.AuthExpired - If no action is taken on the tunnel within LinkSetupTimeout LM will
fire TunnelEvents.AuthExpired and remove the associated tunnel descriptor.
(3) ##REMOVED## TunnelEvents.Created - On both nodes A & B, on a successful completion of CBT
TCI_CREATE_TUNNEL, the TAP device exists and TunnelEvents.Created is fired.
(4) TunnelEvents.Connected - After Tincan delivers the online event to LM TunnelEvents.Connected
is fired.
(5) TunnelEvents.Disconnected - After Tincan signals link offline or QUERY_LNK_STATUS discovers
offline TunnelEvents.Disconnected is fired.
(6) TunnelEvents.Removed - After the TAP device is removed TunnelEvents.Removed is fired and the
tunnel descriptor is removed. Tunnel must be in TUNNEL_STATES.ONLINE or TUNNEL_STATES.OFFLINE
Internal States
(1) TUNNEL_STATES.AUTHORIZED - After a successful completion of CBT LNK_AUTH_TUNNEL, the tunnel
descriptor exists.
(2) TUNNEL_STATES.CREATING - entered on reception of CBT LNK_CREATE_TUNNEL.
(3) TUNNEL_STATES.QUERYING - entered before issuing CBT TCI_QUERY_LINK_INFO. Happens when
LinkStateChange is LINK_STATE_DOWN and state is not already TUNNEL_STATES.QUERYING; OR
TCI_QUERY_LINK_INFO is OFFLINE and state is not already TUNNEL_STATES.QUERYING.
(4) TUNNEL_STATES.ONLINE - entered when CBT TCI_QUERY_LINK_INFO is ONLINE or LinkStateChange is
LINK_STATE_UP.
(5) TUNNEL_STATES.OFFLINE - entered when QUERY_LNK_STATUS is OFFLINE or LinkStateChange is
LINK_STATE_DOWN event.
"""

View File

@ -39,7 +39,7 @@ from typing import Optional, Tuple, Union
import broker import broker
import slixmpp import slixmpp
from broker import CACHE_EXPIRY_INTERVAL, PRESENCE_INTERVAL, statement_false from broker import CACHE_EXPIRY_INTERVAL, PRESENCE_INTERVAL
from broker.cbt import CBT from broker.cbt import CBT
from broker.controller_module import ControllerModule from broker.controller_module import ControllerModule
from broker.remote_action import RemoteAction from broker.remote_action import RemoteAction
@ -403,17 +403,13 @@ class XmppTransport(slixmpp.ClientXMPP):
self.loop.close() self.loop.close()
self.logger.debug("Event loop closed on XMPP overlay=%s", self._overlay_id) self.logger.debug("Event loop closed on XMPP overlay=%s", self._overlay_id)
def shutdown(self):
    """Request a disconnect of this XMPP transport via its event loop.

    The disconnect is not called directly; it is handed to the loop with
    ``call_soon_threadsafe`` together with the positional arguments
    ``2, "controller shutdown", True``.
    """
    overlay_id = self._overlay_id
    self.logger.debug("Initiating shutdown of XMPP overlay %s", overlay_id)
    disconnect_args = (2, "controller shutdown", True)
    self.loop.call_soon_threadsafe(self.disconnect, *disconnect_args)
class XmppCircle: class XmppCircle:
_REFLECT: list[str] = ["xport", "_transmission_queue", "jid_cache"] _REFLECT: list[str] = ["xport", "transmit_queues", "jid_cache"]
def __init__( def __init__(
self, node_id: str, overlay_id: str, ovl_config: dict, **kwargs self, node_id: str, overlay_id: str, ovl_config: dict, **kwargs
@ -507,11 +503,9 @@ class Signal(ControllerModule):
) )
self._circles[olid] = xcir self._circles[olid] = xcir
xcir.start() xcir.start()
self.register_timed_transaction( self.register_deferred_call(
self,
statement_false,
self.on_exp_presence,
PRESENCE_INTERVAL * random.randint(1, 5), PRESENCE_INTERVAL * random.randint(1, 5),
self.on_exp_presence,
) )
self.logger.info("Controller module loaded") self.logger.info("Controller module loaded")
@ -520,16 +514,13 @@ class Signal(ControllerModule):
20, 50 20, 50
) )
def on_exp_presence(self):
    """Announce this node's presence on connected circles and re-arm.

    For each circle with a connected transport, a presence carrying
    ``"ident#" + node_id`` is sent. The method then schedules itself again
    as a deferred call after ``_next_anc_interval()``.
    """
    with self._lck:
        ident = "ident#" + self.node_id
        for circle in self._circles.values():
            transport = circle.xport
            if not transport or not transport.is_connected():
                continue
            transport.send_presence_safe(pstatus=ident)
        self.register_deferred_call(
            self._next_anc_interval(), self.on_exp_presence
        )
def on_presence(self, msg): def on_presence(self, msg):
@ -709,33 +700,26 @@ class Signal(ControllerModule):
def scavenge_expired_outgoing_rem_acts(self, outgoing_rem_acts: dict[str, Queue]):
    """Fail the oldest queued remote action of each peer whose JID lookup timed out.

    A peer is selected when the oldest entry in its transmit queue is at
    least ``self._jid_resolution_timeout`` seconds old (the timestamp is the
    third element of the queued entry). For each selected peer the oldest
    entry is dequeued and, if a CBT is still pending for its action tag,
    that CBT is completed with a "Peer lookup failed" response.

    Args:
        outgoing_rem_acts: mapping of peer ID to that peer's transmit queue.
    """
    now = time.time()
    expired_peers = []
    for peer_id, transmit_queue in outgoing_rem_acts.items():
        if not transmit_queue.queue:
            continue
        remact_descr = transmit_queue.queue[0]  # peek at the first/oldest entry
        if now - remact_descr[2] >= self._jid_resolution_timeout:
            expired_peers.append(peer_id)
            self.logger.debug("Remote act scavenged for removal %s", remact_descr)
    for peer_id in expired_peers:
        transmit_queue = outgoing_rem_acts[peer_id]
        try:
            remact = transmit_queue.get_nowait()
        except Empty:
            # The queue was drained since the peek; keep processing the
            # remaining peers instead of abandoning the whole sweep.
            continue
        tag = remact[1].action_tag
        cbt = self._cbts_pending_remote_resp.pop(tag, None)
        # The CBT may already have been completed or removed elsewhere;
        # only respond when one is still pending for this tag.
        if cbt is not None:
            cbt.set_response("Peer lookup failed", False)
            self.complete_cbt(cbt)
        transmit_queue.task_done()
def _setup_circle(self, overlay_id: str): def _setup_circle(self, overlay_id: str):
xcir = XmppCircle( xcir = XmppCircle(

View File

@ -26,10 +26,11 @@ except ImportError:
import subprocess import subprocess
import time import time
from copy import deepcopy
from threading import Event from threading import Event
import broker import broker
from broker import TINCAN_CHK_INTERVAL, statement_false from broker import TINCAN_CHK_INTERVAL
from broker.cbt import CBT from broker.cbt import CBT
from broker.controller_module import ControllerModule from broker.controller_module import ControllerModule
from broker.process_proxy import ProxyMsg from broker.process_proxy import ProxyMsg
@ -83,7 +84,7 @@ class TincanTunnel(ControllerModule):
self._register_req_handlers() self._register_req_handlers()
self._register_resp_handlers() self._register_resp_handlers()
self._tci_publisher = self.publish_subscription("TCI_TUNNEL_EVENT") self._tci_publisher = self.publish_subscription("TCI_TUNNEL_EVENT")
self.on_expire_chk_tincan() self.register_deferred_call(TINCAN_CHK_INTERVAL, self.on_expire_chk_tincan)
self.logger.info("Controller module loaded") self.logger.info("Controller module loaded")
def _register_abort_handlers(self): def _register_abort_handlers(self):
@ -135,7 +136,7 @@ class TincanTunnel(ControllerModule):
def _create_tunnel(self, cbt: CBT): def _create_tunnel(self, cbt: CBT):
msg = cbt.request.params msg = cbt.request.params
tnlid = msg["TunnelId"] tnlid = msg["TunnelId"]
ctl = broker.CTL_CREATE_TUNNEL ctl = deepcopy(broker.CTL_CREATE_TUNNEL)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
req = ctl["Request"] req = ctl["Request"]
req["StunServers"] = msg["StunServers"] req["StunServers"] = msg["StunServers"]
@ -147,9 +148,6 @@ class TincanTunnel(ControllerModule):
tc_proc = self._tc_proc_tbl[tnlid] tc_proc = self._tc_proc_tbl[tnlid]
self._tnl_cbts[cbt.tag] = cbt self._tnl_cbts[cbt.tag] = cbt
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
for turn in req["TurnServers"]:
turn["User"] = "***"
turn["Password"] = "***"
def req_handler_create_link(self, cbt: CBT): def req_handler_create_link(self, cbt: CBT):
try: try:
@ -159,6 +157,8 @@ class TincanTunnel(ControllerModule):
cbt.add_context("OnRegister", self._create_link) cbt.add_context("OnRegister", self._create_link)
self._tnl_cbts[tnlid] = cbt self._tnl_cbts[tnlid] = cbt
self._start_tincan(tnlid) self._start_tincan(tnlid)
self._tc_proc_tbl[tnlid].ovlid = msg["OverlayId"]
self._tc_proc_tbl[tnlid].tap_name = msg["TapName"]
else: else:
self._create_link(cbt) self._create_link(cbt)
except Exception: except Exception:
@ -169,7 +169,7 @@ class TincanTunnel(ControllerModule):
def _create_link(self, cbt: CBT): def _create_link(self, cbt: CBT):
msg = cbt.request.params msg = cbt.request.params
tnlid = msg["TunnelId"] tnlid = msg["TunnelId"]
ctl = broker.CTL_CREATE_LINK ctl = deepcopy(broker.CTL_CREATE_LINK)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
req = ctl["Request"] req = ctl["Request"]
req["TunnelId"] = tnlid req["TunnelId"] = tnlid
@ -187,9 +187,6 @@ class TincanTunnel(ControllerModule):
tc_proc = self._tc_proc_tbl[tnlid] tc_proc = self._tc_proc_tbl[tnlid]
self._tnl_cbts[cbt.tag] = cbt self._tnl_cbts[cbt.tag] = cbt
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
for turn in req["TurnServers"]:
turn["User"] = "***"
turn["Password"] = "***"
def req_handler_query_candidate_address_set(self, cbt: CBT): def req_handler_query_candidate_address_set(self, cbt: CBT):
msg = cbt.request.params msg = cbt.request.params
@ -198,7 +195,7 @@ class TincanTunnel(ControllerModule):
err_msg = f"No tunnel exists for tunnel ID: {tnlid[:7]}" err_msg = f"No tunnel exists for tunnel ID: {tnlid[:7]}"
cbt.set_response({"ErrorMsg": err_msg, "Status": False}) cbt.set_response({"ErrorMsg": err_msg, "Status": False})
return return
ctl = broker.CTL_QUERY_CAS ctl = deepcopy(broker.CTL_QUERY_CAS)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
ctl["Request"]["TunnelId"] = tnlid ctl["Request"]["TunnelId"] = tnlid
tc_proc = self._tc_proc_tbl[tnlid] tc_proc = self._tc_proc_tbl[tnlid]
@ -206,17 +203,16 @@ class TincanTunnel(ControllerModule):
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
def req_handler_query_link_stats(self, cbt: CBT): def req_handler_query_link_stats(self, cbt: CBT):
msg = cbt.request.params tnlid = cbt.request.params["TunnelId"]
tnlid = msg["TunnelId"] tc_proc = self._tc_proc_tbl.get(tnlid)
if tnlid not in self._tc_proc_tbl: if not tc_proc:
err_msg = f"No tunnel exists for tunnel ID: {tnlid[:7]}" err_msg = f"No tunnel exists for tunnel ID: {tnlid[:7]}"
cbt.set_response({"ErrorMsg": err_msg, "Status": False}) cbt.set_response({"ErrorMsg": err_msg, "Status": False})
self.complete_cbt(cbt) self.complete_cbt(cbt)
return return
ctl = broker.CTL_QUERY_LINK_STATS ctl = deepcopy(broker.CTL_QUERY_LINK_STATS)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
ctl["Request"]["TunnelId"] = tnlid ctl["Request"]["TunnelId"] = tnlid
tc_proc = self._tc_proc_tbl[tnlid]
self._tnl_cbts[cbt.tag] = cbt self._tnl_cbts[cbt.tag] = cbt
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
@ -228,8 +224,9 @@ class TincanTunnel(ControllerModule):
cbt.set_response(err_msg, True) cbt.set_response(err_msg, True)
self.complete_cbt(cbt) self.complete_cbt(cbt)
return return
self.logger.debug("Removing tunnel %s", tnlid) self.logger.info("Removing tunnel %s", tnlid)
tc_proc = self._tc_proc_tbl.pop(tnlid, None) tc_proc = self._tc_proc_tbl.pop(tnlid, None)
self._pids.pop(tc_proc.proc.pid, None)
self._stop_tincan(tc_proc) self._stop_tincan(tc_proc)
cbt.set_response("Tunnel removed", True) cbt.set_response("Tunnel removed", True)
self.complete_cbt(cbt) self.complete_cbt(cbt)
@ -242,7 +239,7 @@ class TincanTunnel(ControllerModule):
cbt.set_response({"ErrorMsg": err_msg, "Status": False}) cbt.set_response({"ErrorMsg": err_msg, "Status": False})
self.complete_cbt(cbt) self.complete_cbt(cbt)
return return
ctl = broker.CTL_REMOVE_LINK ctl = deepcopy(broker.CTL_REMOVE_LINK)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
req = ctl["Request"] req = ctl["Request"]
req["TunnelId"] = tnlid req["TunnelId"] = tnlid
@ -252,16 +249,18 @@ class TincanTunnel(ControllerModule):
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
def req_handler_send_echo(self, cbt: CBT): def req_handler_send_echo(self, cbt: CBT):
ctl = broker.CTL_ECHO ctl = deepcopy(broker.CTL_ECHO)
ctl["TransactionId"] = cbt.tag ctl["TransactionId"] = cbt.tag
tnlid = cbt.request.params tnlid = cbt.request.params
tc_proc = self._tc_proc_tbl.get(tnlid) tc_proc = self._tc_proc_tbl.get(tnlid)
if tc_proc.do_chk and tc_proc.echo_replies > 0: if tc_proc and tc_proc.do_chk and tc_proc.echo_replies > 0:
tc_proc.echo_replies -= 1 tc_proc.echo_replies -= 1
ctl["Request"]["Message"] = tc_proc.tnlid ctl["Request"]["Message"] = tc_proc.tnlid
self._tnl_cbts[cbt.tag] = cbt self._tnl_cbts[cbt.tag] = cbt
self.send_control(tc_proc.ipc_id, json.dumps(ctl)) self.send_control(tc_proc.ipc_id, json.dumps(ctl))
else: else:
tc_proc.do_chk = False
cbt.set_response(f"Cannot send echo to {tc_proc}", False) cbt.set_response(f"Cannot send echo to {tc_proc}", False)
self.complete_cbt(cbt) self.complete_cbt(cbt)
@ -269,6 +268,9 @@ class TincanTunnel(ControllerModule):
tnlid = cbt.response.data tnlid = cbt.response.data
if cbt.response.status and tnlid in self._tc_proc_tbl: if cbt.response.status and tnlid in self._tc_proc_tbl:
self._tc_proc_tbl[tnlid].echo_replies = broker.MAX_HEARTBEATS self._tc_proc_tbl[tnlid].echo_replies = broker.MAX_HEARTBEATS
self.register_internal_cbt(
"TCI_QUERY_LINK_INFO", {"TunnelId": tnlid}, lifespan=15
)
else: else:
self.logger.info(cbt.response.data) self.logger.info(cbt.response.data)
self.free_cbt(cbt) self.free_cbt(cbt)
@ -288,32 +290,35 @@ class TincanTunnel(ControllerModule):
else: else:
# tincan process unresponsive # tincan process unresponsive
self.logger.warning( self.logger.warning(
"unnel: %s health check failed, terminating process: %s", "Tunnel: %s health check failed, terminating process: %s",
tnlid, tnlid,
tc_proc, tc_proc,
) )
self._stop_tincan(tc_proc) self._pids.pop(tc_proc.proc.pid, None)
self._notify_tincan_terminated(tnlid)
self._tc_proc_tbl.pop(tnlid, None) self._tc_proc_tbl.pop(tnlid, None)
self._stop_tincan(tc_proc)
self._notify_tincan_terminated(tc_proc)
def req_handler_check_process(self, cbt): def req_handler_check_process(self, cbt: CBT):
if self.exit_ev.is_set():
return
exit_code = None exit_code = None
rmv = [] rmv = []
for tnlid, tc_proc in self._tc_proc_tbl.items(): for tc_proc in self._tc_proc_tbl.values():
exit_code = tc_proc.proc.poll() exit_code = tc_proc.proc.poll()
if exit_code: if exit_code:
# tincan process crashed # tincan process crashed
rmv.append(tnlid) self.logger.warning(
for tnlid in rmv: "Tincan process %s exited unexpectedly with code %s",
self.logger.warning( tc_proc.proc.pid,
"Tincan process %s exited unexpectedly with code, %s", exit_code,
tc_proc.proc.pid, )
exit_code, rmv.append(tc_proc)
) for tc_proc in rmv:
self._notify_tincan_terminated(tnlid) self._pids.pop(tc_proc.proc.pid)
self._tc_proc_tbl.pop(tnlid, None) self._tc_proc_tbl.pop(tc_proc.tnlid, None)
self._remove_tap(tc_proc.tap_name)
self._notify_tincan_terminated(tc_proc)
cbt.set_response(rmv, True)
self.complete_cbt(cbt)
def on_timer_event(self): def on_timer_event(self):
if self.exit_ev.is_set(): if self.exit_ev.is_set():
@ -321,18 +326,14 @@ class TincanTunnel(ControllerModule):
# send an echo health check every timer interval, eg., 30s # send an echo health check every timer interval, eg., 30s
for tnlid, tc_proc in self._tc_proc_tbl.items(): for tnlid, tc_proc in self._tc_proc_tbl.items():
if tc_proc.do_chk: if tc_proc.do_chk:
self.register_internal_cbt("_TCI_SEND_ECHO", tnlid) self.register_internal_cbt("_TCI_SEND_ECHO", tnlid, lifespan=10)
def on_expire_chk_tincan(self, *_): def on_expire_chk_tincan(self):
if self.exit_ev.is_set(): if self.exit_ev.is_set():
return return
self.register_internal_cbt("_TCI_CHK_PROCESS") if self._tc_proc_tbl:
self.register_timed_transaction( self.register_internal_cbt("_TCI_CHK_PROCESS")
self, self.register_deferred_call(TINCAN_CHK_INTERVAL, self.on_expire_chk_tincan)
statement_false,
self.on_expire_chk_tincan,
TINCAN_CHK_INTERVAL,
)
def terminate(self): def terminate(self):
self.exit_ev.set() self.exit_ev.set()
@ -399,34 +400,35 @@ class TincanTunnel(ControllerModule):
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
exit_code = tc_proc.proc.poll() exit_code = tc_proc.proc.poll()
if exit_code is None: if exit_code is None:
self._remove_tap() self._remove_tap(tc_proc.tap_name)
tc_proc.proc.kill() tc_proc.proc.kill()
self._kill_times.append(self._kill_times[-1] + time.time() - ts) self._kill_times.append(self._kill_times[-1] + time.time() - ts)
self.logger.debug("Killed unresponsive Tincan: %s", tc_proc.proc.pid) self.logger.debug("Killed unresponsive Tincan: %s", tc_proc.proc.pid)
self._pids.pop(tc_proc.proc.pid)
self.logger.info( self.logger.info(
"Process %s for tunnel %s terminated", tc_proc.proc.pid, tc_proc.tnlid "Process %s for tunnel %s terminated", tc_proc.proc.pid, tc_proc.tnlid
) )
def _notify_tincan_terminated(self, tnlid: str): def _notify_tincan_terminated(self, tc_proc: TincanProcess):
self._tci_publisher.post_update( self._tci_publisher.post_update(
{ {
"Command": "TincanTunnelFailed", "Command": "TincanTunnelFailed",
"Reason": "Tincan process terminated", "Reason": "Tincan process terminated",
"OverlayId": self._tc_proc_tbl[tnlid].ovlid, "OverlayId": tc_proc.ovlid,
"TunnelId": tnlid, "TunnelId": tc_proc.tnlid,
"TapName": self._tc_proc_tbl[tnlid].tap_name, "TapName": tc_proc.tap_name,
} }
) )
def handle_ipc(self, msg: ProxyMsg): def handle_ipc(self, msg: ProxyMsg):
if self.exit_ev.is_set():
return
try: try:
ctl = msg.json ctl = msg.json
if ctl["ProtocolVersion"] != EVIO_VER_CTL: if ctl["ProtocolVersion"] != EVIO_VER_CTL:
raise ValueError("Invalid control version detected") raise ValueError("Invalid control version detected")
# self.logger.debug("Received dataplane control - %s", ctl)
# Get the original CBT if this is the response # Get the original CBT if this is the response
if ctl["ControlType"] == "Response": if ctl["ControlType"] == "Response":
# self.logger.debug("Received Tincan control response: %s", ctl)
cbt = self._tnl_cbts.pop(ctl["TransactionId"]) cbt = self._tnl_cbts.pop(ctl["TransactionId"])
cbt.set_response( cbt.set_response(
ctl["Response"]["Message"], ctl["Response"]["Message"],

View File

@ -31,14 +31,13 @@ from random import randint
from typing import Optional from typing import Optional
import broker import broker
from broker import ( from broker import ( # PEER_DISCOVERY_COALESCE,
CBT_LIFESPAN, CBT_LIFESPAN,
EXCLUSION_BASE_INTERVAL, EXCLUSION_BASE_INTERVAL,
MAX_CONCURRENT_OPS, MAX_CONCURRENT_OPS,
MAX_ON_DEMAND_EDGES, MAX_ON_DEMAND_EDGES,
MAX_SUCCESSIVE_FAILS, MAX_SUCCESSIVE_FAILS,
MIN_SUCCESSORS, MIN_SUCCESSORS,
PEER_DISCOVERY_COALESCE,
STALE_INTERVAL, STALE_INTERVAL,
SUCCESSIVE_FAIL_DECR, SUCCESSIVE_FAIL_DECR,
SUCCESSIVE_FAIL_INCR, SUCCESSIVE_FAIL_INCR,
@ -355,18 +354,7 @@ class Topology(ControllerModule):
disc = DiscoveredPeer(peer_id) disc = DiscoveredPeer(peer_id)
self._net_ovls[olid].known_peers[peer_id] = disc self._net_ovls[olid].known_peers[peer_id] = disc
disc.presence() disc.presence()
if disc.is_available: self._update_overlay(olid)
self._net_ovls[olid].new_peer_count += 1
if self._net_ovls[olid].new_peer_count >= self.config.get(
"PeerDiscoveryCoalesce", PEER_DISCOVERY_COALESCE
):
self.logger.info(
"Coalesced %d of %d discovered peers, attempting update on overlay %s",
self._net_ovls[olid].new_peer_count,
self.config.get("PeerDiscoveryCoalesce", PEER_DISCOVERY_COALESCE),
olid,
)
self._update_overlay(olid)
cbt.set_response(None, True) cbt.set_response(None, True)
self.complete_cbt(cbt) self.complete_cbt(cbt)
@ -678,6 +666,14 @@ class Topology(ControllerModule):
self._process_next_transition(ovl) self._process_next_transition(ovl)
else: else:
self.free_cbt(cbt) self.free_cbt(cbt)
ce = ovl.adjacency_list.get(peer_id)
if ce.edge_state != EDGE_STATES.Connected:
self.register_timed_transaction(
(ce, olid),
self._is_connedge_connected,
self._on_connedge_timeout,
30,
)
def resp_handler_remove_tnl(self, cbt: CBT): def resp_handler_remove_tnl(self, cbt: CBT):
params = cbt.request.params params = cbt.request.params
@ -1074,9 +1070,9 @@ class Topology(ControllerModule):
raise ValueError(f"Invalid request: Undefinfed tunnel type {dataplane}") raise ValueError(f"Invalid request: Undefinfed tunnel type {dataplane}")
def _initiate_remove_edge(self, net_ovl: NetworkOverlay, peer_id: str): def _initiate_remove_edge(self, net_ovl: NetworkOverlay, peer_id: str):
if peer_id not in net_ovl.adjacency_list: ce = net_ovl.adjacency_list.get(peer_id)
raise RuntimeWarning("No connection edge to peer found") if not ce:
ce = net_ovl.adjacency_list[peer_id] return
if ( if (
ce.edge_state == EDGE_STATES.Connected ce.edge_state == EDGE_STATES.Connected
and ce.role == CONNECTION_ROLE.Initiator and ce.role == CONNECTION_ROLE.Initiator
@ -1091,8 +1087,6 @@ class Topology(ControllerModule):
raise ValueError("Successor threshold not met") raise ValueError("Successor threshold not met")
self.logger.debug("Removing edge %s", ce) self.logger.debug("Removing edge %s", ce)
self._remove_tunnel(net_ovl, ce.dataplane, ce.peer_id, ce.edge_id) self._remove_tunnel(net_ovl, ce.dataplane, ce.peer_id, ce.edge_id)
return True
return False
def _remove_tunnel( def _remove_tunnel(
self, self,

View File

@ -30,7 +30,7 @@ import urllib.request as request
from urllib.error import HTTPError, URLError from urllib.error import HTTPError, URLError
from broker.cbt import CBT from broker.cbt import CBT
from broker.controller_module import ControllerModule from broker.controller_module import ControllerModule, introspect
class UsageReport(ControllerModule): class UsageReport(ControllerModule):
@ -44,6 +44,9 @@ class UsageReport(ControllerModule):
"NodeId": hashlib.sha256(self.node_id.encode("utf-8")).hexdigest(), "NodeId": hashlib.sha256(self.node_id.encode("utf-8")).hexdigest(),
} }
def __repr__(self):
return introspect(self)
def initialize(self): def initialize(self):
self.logger.info("Controller module loaded") self.logger.info("Controller module loaded")