Slony-I 2.0.8 Documentation | ||||
---|---|---|---|---|
Prev | Fast Backward | Chapter 6. Schema schemadoc | Fast Forward | Next |
6.63. failednode(integer, integer)
Function Properties
Language: PLPGSQL
Return Type: integer
Initiate failover from failed_node to backup_node. This function must be called on all nodes, and then waited for the restart of all node daemons.declare p_failed_node alias for $1; p_backup_node alias for $2; v_row record; v_row2 record; v_n int4; begin -- ---- -- Grab the central configuration lock -- ---- lock table sl_config_lock; -- ---- -- All consistency checks first -- Check that every node that has a path to the failed node -- also has a path to the backup node. -- ---- for v_row in select P.pa_client from sl_path P where P.pa_server = p_failed_node and P.pa_client <> p_backup_node and not exists (select true from sl_path PP where PP.pa_server = p_backup_node and PP.pa_client = P.pa_client) loop raise exception 'Slony-I: cannot failover - node % has no path to the backup node', v_row.pa_client; end loop; -- ---- -- Check all sets originating on the failed node -- ---- for v_row in select set_id from sl_set where set_origin = p_failed_node loop -- ---- -- Check that the backup node is subscribed to all sets -- that originate on the failed node -- ---- select into v_row2 sub_forward, sub_active from sl_subscribe where sub_set = v_row.set_id and sub_receiver = p_backup_node; if not found then raise exception 'Slony-I: cannot failover - node % is not subscribed to set %', p_backup_node, v_row.set_id; end if; -- ---- -- Check that the subscription is active -- ---- if not v_row2.sub_active then raise exception 'Slony-I: cannot failover - subscription for set % is not active', v_row.set_id; end if; -- ---- -- If there are other subscribers, the backup node needs to -- be a forwarder too. -- ---- select into v_n count(*) from sl_subscribe where sub_set = v_row.set_id and sub_receiver <> p_backup_node; if v_n > 0 and not v_row2.sub_forward then raise exception 'Slony-I: cannot failover - node % is not a forwarder of set %', p_backup_node, v_row.set_id; end if; end loop; -- ---- -- Terminate all connections of the failed node the hard way -- ---- perform terminateNodeConnections(p_failed_node); -- ---- -- Move the sets -- ---- for v_row in select S.set_id, (select count(*) from sl_subscribe SUB where S.set_id = SUB.sub_set and SUB.sub_receiver <> p_backup_node and SUB.sub_provider = p_failed_node) as num_direct_receivers from sl_set S where S.set_origin = p_failed_node for update loop -- ---- -- If the backup node is the only direct subscriber ... -- ---- if v_row.num_direct_receivers = 0 then raise notice 'failedNode: set % has no other direct receivers - move now', v_row.set_id; -- ---- -- backup_node is the only direct subscriber, move the set -- right now. On the backup node itself that includes restoring -- all user mode triggers, removing the protection trigger, -- adding the log trigger, removing the subscription and the -- obsolete setsync status. -- ---- if p_backup_node = getLocalNodeId('_schemadoc') then update sl_set set set_origin = p_backup_node where set_id = v_row.set_id; delete from sl_setsync where ssy_setid = v_row.set_id; for v_row2 in select * from sl_table where tab_set = v_row.set_id order by tab_id loop perform alterTableConfigureTriggers(v_row2.tab_id); end loop; end if; delete from sl_subscribe where sub_set = v_row.set_id and sub_receiver = p_backup_node; else raise notice 'failedNode: set % has other direct receivers - change providers only', v_row.set_id; -- ---- -- Backup node is not the only direct subscriber or not -- a direct subscriber at all. -- This means that at this moment, we redirect all possible -- direct subscribers to receive from the backup node, and the -- backup node itself to receive from another one. -- The admin utility will wait for the slon engine to -- restart and then call failedNode2() on the node with -- the highest SYNC and redirect this to it on -- backup node later. -- ---- update sl_subscribe set sub_provider = (select min(SS.sub_receiver) from sl_subscribe SS where SS.sub_set = v_row.set_id and SS.sub_receiver <> p_backup_node and SS.sub_forward and exists ( select 1 from sl_path where pa_server = SS.sub_receiver and pa_client = p_backup_node )) where sub_set = v_row.set_id and sub_receiver = p_backup_node; update sl_subscribe set sub_provider = (select min(SS.sub_receiver) from sl_subscribe SS where SS.sub_set = v_row.set_id and SS.sub_receiver <> p_failed_node and SS.sub_forward and exists ( select 1 from sl_path where pa_server = SS.sub_receiver and pa_client = sl_subscribe.sub_receiver )) where sub_set = v_row.set_id and sub_receiver <> p_backup_node; update sl_subscribe set sub_provider = p_backup_node where sub_set = v_row.set_id and sub_receiver <> p_backup_node and exists ( select 1 from sl_path where pa_server = p_backup_node and pa_client = sl_subscribe.sub_receiver ); delete from sl_subscribe where sub_set = v_row.set_id and sub_receiver = p_backup_node; end if; end loop; -- Rewrite sl_listen table perform RebuildListenEntries(); -- Run addPartialLogIndices() to try to add indices to unused sl_log_? table perform addPartialLogIndices(); -- ---- -- Make sure the node daemon will restart -- ---- notify "_schemadoc_Restart"; -- ---- -- That is it - so far. -- ---- return p_failed_node; end;