Jeff threshar at torgo.978.org
Tue Dec 8 09:56:26 PST 2009
On Dec 8, 2009, at 1:33 AM, Maxim Boguk wrote:

> I have a bit uncommon configuration now.
> Master: postgresql 8.3
> Slave:   postgresql 8.4
> slony:   version 1.2.17
>

You know what is funny - I *JUST* ran into this problem too!
It only seems to happen when not specifying "only on xxx"

Here's a **HACK** solution to get your replication going again:

-- HACK (mailing-list workaround for Slony-I 1.2.17):
-- Replacement for alterTableForReplication() that treats a table already in
-- the "altered" state as a NOTICE instead of an EXCEPTION, so that a failed
-- EXECUTE SCRIPT which left tables half-altered no longer wedges replication.
--
-- NOTE(review): "_replication" below must be renamed to the actual cluster
-- schema (e.g. "_slony") before installing.
--
-- Parameter: $1 (p_tab_id) - sl_table.tab_id of the table to put under
--            replication control.
-- Returns:   p_tab_id on success (also when the table was already altered).
-- Raises:    exception when the table id is unknown, or when conflicting
--            duplicate trigger names would make trigger-hiding ambiguous.
create or replace function _replication.altertableforreplication(int)
returns int
as $$
declare
	p_tab_id			alias for $1;	-- table id in sl_table
	v_no_id				int4;			-- id of the local node
	v_tab_row			record;			-- sl_table/sl_set/catalog join row
	v_tab_fqname		text;			-- quoted "schema"."table" name
	v_tab_attkind		text;			-- key-attribute kind string for logTrigger
	v_n					int4;			-- rows affected by catalog updates
	v_trec	record;						-- iterator over conflicting triggers
	v_tgbad	boolean;					-- true when a trigger-name conflict was found
begin
	-- ----
	-- Grab the central configuration lock
	-- ----
	lock table "_replication".sl_config_lock;

	-- ----
	-- Get our local node ID
	-- ----
	v_no_id := "_replication".getLocalNodeId('_replication');

	-- ----
	-- Get the sl_table row and the current origin of the table.
	-- Joins through pg_index/pg_class to resolve the row-identity index
	-- (tab_idxname) whose oid is later used as the trigger hiding target.
	-- Locked FOR UPDATE to serialize against concurrent config changes.
	-- ----
	select T.tab_reloid, T.tab_set, T.tab_idxname, T.tab_altered,
			S.set_origin, PGX.indexrelid,
			"_replication".slon_quote_brute(PGN.nspname) || '.' ||
			"_replication".slon_quote_brute(PGC.relname) as tab_fqname
			into v_tab_row
			from "_replication".sl_table T, "_replication".sl_set S,
				"pg_catalog".pg_class PGC, "pg_catalog".pg_namespace PGN,
				"pg_catalog".pg_index PGX, "pg_catalog".pg_class PGXC
			where T.tab_id = p_tab_id
				and T.tab_set = S.set_id
				and T.tab_reloid = PGC.oid
				and PGC.relnamespace = PGN.oid
				and PGX.indrelid = T.tab_reloid
				and PGX.indexrelid = PGXC.oid
				and PGXC.relname = T.tab_idxname
				for update;
	if not found then
		raise exception 'Slony-I: alterTableForReplication(): Table with id % not found', p_tab_id;
	end if;
	v_tab_fqname := v_tab_row.tab_fqname;
	-- HACK: stock Slony raises an EXCEPTION here; downgraded to a NOTICE plus
	-- an early return so that re-running the alter is idempotent instead of
	-- fatal.  This is the whole point of this replacement function.
	if v_tab_row.tab_altered then
		raise notice 'Slony-I: alterTableForReplication(): Table % is already in altered state',
				v_tab_fqname;

		return p_tab_id;
	end if;

	-- Determine the attkind string describing which columns form the
	-- row-identity key used by the log trigger.
	v_tab_attkind :=
"_replication".determineAttKindUnique(v_tab_row.tab_fqname,
						v_tab_row.tab_idxname);

	-- Block all concurrent access while we rewire triggers/rules.
	execute 'lock table ' || v_tab_fqname || ' in access exclusive mode';

	-- ----
	-- Procedures are different on origin and subscriber
	-- ----
	if v_no_id = v_tab_row.set_origin then
		-- ----
		-- On the Origin we add the log trigger to the table and done
		-- ----
		execute 'create trigger "_replication_logtrigger_' ||
				p_tab_id::text || '" after insert or update or delete on ' ||
				v_tab_fqname || ' for each row execute procedure
				"_replication".logTrigger (''_replication'', ''' ||
					p_tab_id::text || ''', ''' ||
					v_tab_attkind || ''');';
	else
		-- ----
		-- On the subscriber the thing is a bit more difficult. We want
		-- to disable all user- and foreign key triggers and rules.
		-- ----


		-- ----
		-- Check to see if there are any trigger conflicts: Slony "hides"
		-- triggers by repointing tgrelid at the row-identity index, so a
		-- trigger of the same name already parked on that index would make
		-- the later restore ambiguous.
		-- ----
		v_tgbad := false;
		for v_trec in
			select pc.relname, tg1.tgname from
			"pg_catalog".pg_trigger tg1,
			"pg_catalog".pg_trigger tg2,
			"pg_catalog".pg_class pc,
			"pg_catalog".pg_index pi,
			"_replication".sl_table tab
			where
			 tg1.tgname = tg2.tgname and        -- Trigger names match
			 tg1.tgrelid = tab.tab_reloid and   -- trigger 1 is on the table
			 pi.indexrelid = tg2.tgrelid and    -- trigger 2 is on the index
			 pi.indrelid = tab.tab_reloid and   -- indexes table is this table
			 pc.oid = tab.tab_reloid
                 loop
			raise notice 'Slony-I: alterTableForReplication(): multiple instances of trigger % on table %',
				v_trec.tgname, v_trec.relname;
			v_tgbad := true;
		end loop;
		if v_tgbad then
			raise exception 'Slony-I: Unable to disable triggers';
		end if;

		-- ----
		-- Disable all existing triggers by repointing them at the
		-- row-identity index (the classic Slony 1.2 hiding trick), except
		-- triggers Slony itself registered in sl_trigger.
		-- ----
		update "pg_catalog".pg_trigger
				set tgrelid = v_tab_row.indexrelid
				where tgrelid = v_tab_row.tab_reloid
				and not exists (
						select true from "_replication".sl_table TAB,
								"_replication".sl_trigger TRIG
								where TAB.tab_reloid = tgrelid
								and TAB.tab_id = TRIG.trig_tabid
								and TRIG.trig_tgname = tgname
					);
		get diagnostics v_n = row_count;
		-- pg_class.reltriggers exists only through PostgreSQL 8.3 (removed in
		-- 8.4); the information_schema probe makes this work on both, which
		-- matters for the mixed 8.3-master/8.4-slave setup being discussed.
		if (v_n > 0) and exists (select 1 from information_schema.columns
where table_name = 'pg_class' and table_schema = 'pg_catalog' and
column_name = 'reltriggers') then
			update "pg_catalog".pg_class
					set reltriggers = reltriggers - v_n
					where oid = v_tab_row.tab_reloid;
		end if;

		-- ----
		-- Disable all existing rules the same way: repoint ev_class at the
		-- index so the rewriter no longer sees them on the table.
		-- ----
		update "pg_catalog".pg_rewrite
				set ev_class = v_tab_row.indexrelid
				where ev_class = v_tab_row.tab_reloid;
		get diagnostics v_n = row_count;
		if v_n > 0 then
			update "pg_catalog".pg_class
					set relhasrules = false
					where oid = v_tab_row.tab_reloid;
		end if;

		-- ----
		-- Add the trigger that denies write access to replicated tables
		-- ----
		execute 'create trigger "_replication_denyaccess_' ||
				p_tab_id::text || '" before insert or update or delete on ' ||
				v_tab_fqname || ' for each row execute procedure
				"_replication".denyAccess (''_replication'');';
	end if;

	-- ----
	-- Mark the table altered in our configuration
	-- ----
	update "_replication".sl_table
			set tab_altered = true where tab_id = p_tab_id;

	return p_tab_id;
end;
$$
language plpgsql;

you'll need to change the _replication to _slony - in a nutshell it
makes the tab being altered not an error condition (changing that
EXCEPTION to a NOTICE).

This is a hack solution until a proper fix is done up - was about to  
fire up some test instances and trace what happens.


> Any execute script surely breaks the replication set. Even if I submit
> something like 'select 1' via execute script, replication will be broken
> down irreparably. The only way to fix it is to drop/create/subscribe the slave
> node.
> restarting slon doesn't help at all...
>
> Here is slony log from slave database:
>
> 2009-12-08 09:11:43 MSK INFO   prepare for DDL script - set:1  
> onlyonnode:-1
> 2009-12-08 09:11:43 MSK ERROR  remoteWorkerThread_1: "select
> "_slony".ddlScript_prepare_int(1, -1); " PGRES_FATAL_ERROR ERROR:
> Slony-I: alterTableRestore(): Table "public"."access_type" is not in
> altered state
> CONTEXT:  SQL statement "SELECT  "_slony".alterTableRestore( $1 )"
> PL/pgSQL function "ddlscript_prepare_int" line 46 at PERFORM
> 2009-12-08 09:11:43 MSK ERROR  remoteWorkerThread_1: DDL preparation
> failed - set 1 - only on node -1
> 2009-12-08 09:11:43 MSK DEBUG1 slon: retry requested
> 2009-12-08 09:11:43 MSK INFO   remoteListenThread_1: disconnecting
> from 'dbname=plus1 host=192.168.1.137 port=5432 user=slony'
> 2009-12-08 09:11:43 MSK DEBUG1 syncThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 localListenThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 remoteListenThread_1: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 main: scheduler mainloop returned
> 2009-12-08 09:11:43 MSK DEBUG1 cleanupThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 main: done
> 2009-12-08 09:11:43 MSK DEBUG1 slon: restart of worker
> 2009-12-08 09:11:43 MSK CONFIG main: slon version 1.2.17 starting up
> 2009-12-08 09:11:43 MSK CONFIG main: local node id = 2
> 2009-12-08 09:11:43 MSK CONFIG main: launching sched_start_mainloop
> 2009-12-08 09:11:43 MSK CONFIG main: loading current cluster  
> configuration
> 2009-12-08 09:11:43 MSK CONFIG storeNode: no_id=1  
> no_comment='director'
> 2009-12-08 09:11:43 MSK CONFIG storePath: pa_server=1 pa_client=2
> pa_conninfo="dbname=plus1 host=192.168.1.137 port=5432 user=slony"
> pa_connretry=10
> 2009-12-08 09:11:43 MSK CONFIG storeListen: li_origin=1 li_receiver=2
> li_provider=1
> 2009-12-08 09:11:43 MSK CONFIG storeSet: set_id=1 set_origin=1
> set_comment='A replication set so boring no one thought to give it a
> name'
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG storeSubscribe: sub_set=1
> sub_provider=1 sub_forward='t'
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG enableSubscription: sub_set=1
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG main: configuration complete -  
> starting threads
> 2009-12-08 09:11:43 MSK DEBUG1 localListenThread: thread starts
> NOTICE:  Slony-I: cleanup stale sl_nodelock entry for pid=31722
> 2009-12-08 09:11:43 MSK CONFIG enableNode: no_id=1
> 2009-12-08 09:11:43 MSK DEBUG1 remoteListenThread_1: thread starts
> 2009-12-08 09:11:43 MSK DEBUG1 remoteWorkerThread_1: thread starts
> 2009-12-08 09:11:43 MSK DEBUG1 cleanupThread: thread starts
> 2009-12-08 09:11:43 MSK DEBUG1 syncThread: thread starts
> 2009-12-08 09:11:43 MSK DEBUG1 main: running scheduler mainloop
> 2009-12-08 09:11:43 MSK DEBUG1 remoteWorkerThread_1: helper thread for
> provider 1 created
> 2009-12-08 09:11:43 MSK DEBUG1 remoteListenThread_1: connected to
> 'dbname=plus1 host=192.168.1.137 port=5432 user=slony'
> 2009-12-08 09:11:43 MSK INFO   Checking local node id
> 2009-12-08 09:11:43 MSK INFO   Found local node id
> 2009-12-08 09:11:43 MSK INFO   prepare for DDL script - set:1  
> onlyonnode:-1
> 2009-12-08 09:11:43 MSK ERROR  remoteWorkerThread_1: "select
> "_slony".ddlScript_prepare_int(1, -1); " PGRES_FATAL_ERROR ERROR:
> Slony-I: alterTableRestore(): Table "public"."access_type" is not in
> altered state
> CONTEXT:  SQL statement "SELECT  "_slony".alterTableRestore( $1 )"
> PL/pgSQL function "ddlscript_prepare_int" line 46 at PERFORM
> 2009-12-08 09:11:43 MSK ERROR  remoteWorkerThread_1: DDL preparation
> failed - set 1 - only on node -1
> 2009-12-08 09:11:43 MSK DEBUG1 slon: retry requested
> 2009-12-08 09:11:43 MSK INFO   remoteListenThread_1: disconnecting
> from 'dbname=plus1 host=192.168.1.137 port=5432 user=slony'
> 2009-12-08 09:11:43 MSK DEBUG1 remoteListenThread_1: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 localListenThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 syncThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 cleanupThread: thread done
> 2009-12-08 09:11:43 MSK DEBUG1 main: scheduler mainloop returned
> 2009-12-08 09:11:43 MSK DEBUG1 main: done
> 2009-12-08 09:11:43 MSK DEBUG1 slon: restart of worker
> 2009-12-08 09:11:43 MSK CONFIG main: slon version 1.2.17 starting up
> 2009-12-08 09:11:43 MSK CONFIG main: local node id = 2
> 2009-12-08 09:11:43 MSK CONFIG main: launching sched_start_mainloop
> 2009-12-08 09:11:43 MSK CONFIG main: loading current cluster  
> configuration
> 2009-12-08 09:11:43 MSK CONFIG storeNode: no_id=1  
> no_comment='director'
> 2009-12-08 09:11:43 MSK CONFIG storePath: pa_server=1 pa_client=2
> pa_conninfo="dbname=plus1 host=192.168.1.137 port=5432 user=slony"
> pa_connretry=10
> 2009-12-08 09:11:43 MSK CONFIG storeListen: li_origin=1 li_receiver=2
> li_provider=1
> 2009-12-08 09:11:43 MSK CONFIG storeSet: set_id=1 set_origin=1
> set_comment='A replication set so boring no one thought to give it a
> name'
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG storeSubscribe: sub_set=1
> sub_provider=1 sub_forward='t'
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG enableSubscription: sub_set=1
> 2009-12-08 09:11:43 MSK WARN   remoteWorker_wakeup: node 1 - no  
> worker thread
> 2009-12-08 09:11:43 MSK CONFIG main: configuration complete -  
> starting threads
> 2009-12-08 09:11:43 MSK DEBUG1 localListenThread: thread starts
> 2009-12-08 09:11:43 MSK FATAL  localListenThread: "select
> "_slony".cleanupNodelock(); insert into "_slony".sl_nodelock values (
>  2, 0, "pg_catalog".pg_backend_pid()); " - ERROR:  duplicate key
> value violates unique constraint "sl_nodelock-pkey"
>
> 2009-12-08 09:11:43 MSK DEBUG1 slon: shutdown requested
> 2009-12-08 09:12:03 MSK DEBUG1 slon: child termination timeout -  
> kill child
> 2009-12-08 09:12:03 MSK DEBUG1 slon: done
>
>
> -- 
> =======================================================
> Я в контакте: http://vkontakte.ru/id16323414
> Сила солому ломит, но не все в нашей  
> жизни - солома, да и сила далеко не все.
> _______________________________________________
> Slony1-general mailing list
> Slony1-general at lists.slony.info
> http://lists.slony.info/mailman/listinfo/slony1-general

--
Jeff Trout <jeff at jefftrout.com>
http://www.stuarthamm.net/
http://www.dellsmartexitin.com/





More information about the Slony1-general mailing list