Tue Jan 18 22:57:47 PST 2005
- Previous message: [Slony1-commit] By cbbrowne: Mods per Steve Simms...
- Next message: [Slony1-commit] By cbbrowne: Added docs for a condition which breaks replication
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Log Message:
-----------
Added in a script to test the state of a Slony-I cluster, along with
some documentation
Modified Files:
--------------
slony1-engine/doc/adminguide:
faq.sgml (r1.9 -> r1.10)
monitoring.sgml (r1.7 -> r1.8)
Added Files:
-----------
slony1-engine/tools:
test_slony_state.pl (r1.1)
-------------- next part --------------
--- /dev/null
+++ tools/test_slony_state.pl
@@ -0,0 +1,204 @@
+#!perl # -*- perl -*-
+# $Id: test_slony_state.pl,v 1.1 2005/01/18 22:57:34 cbbrowne Exp $
+# Christopher Browne
+# Copyright 2004
+# Afilias Canada
+
+# This script, given DSN parameters to access a Slony-I cluster,
+# submits a number of queries to test the state of the nodes in the
+# cluster.
+
+use Pg;
+use Getopt::Long;
+#use strict;
+
+my $sleep_seconds = 4;
+
+my $goodopts = GetOptions("help", "database=s", "host=s", "user=s", "cluster=s",
+ "password=s", "port=s");
+if (defined($opt_help)) {
+ show_usage();
+}
+my ($database,$user, $port, $cluster, $host, $password, $set, $finalquery);
+
+$database = $opt_database if (defined($opt_database));
+$port = 5432;
+$port = $opt_port if (defined($opt_port));
+$user = $opt_user if (defined($opt_user));
+$password = $opt_password if (defined($opt_password));
+$host = $opt_host if (defined($opt_host));
+$cluster = $opt_cluster if (defined($opt_cluster));
+
+#DBI: my $initialDSN = "dbi:Pg:dbname=$database;host=$host;port=$port";
+my $initialDSN = "dbname=$database host=$host port=$port";
+$initialDSN = $initialDSN . " password=$password" if defined($opt_password);
+
+print "DSN: $initialDSN\n===========================\n";
+
+# DBI: my $dbh = DBI->connect($initialDSN, $user, $password,
+# {RaiseError => 0, PrintError => 0, AutoCommit => 1});
+# die "connect: $DBI::errstr" if ( !defined($dbh) || $DBI::err );
+my $dbh = Pg::connectdb($initialDSN);
+
+print "Rummage for DSNs\n=============================\n";
+# Query to find live DSNs
+my $dsnsquery =
+"
+ select p.pa_server, p.pa_conninfo
+ from _$cluster.sl_path p
+ where exists (select * from _$cluster.sl_subscribe s where
+ (s.sub_provider = p.pa_server or s.sub_receiver = p.pa_server) and
+ sub_active = 't')
+ group by pa_server, pa_conninfo;
+";
+
+print "Query:\n$dsnsquery\n";
+$tq = $dbh->exec($dsnsquery);
+my %DSN;
+while (my @row = $tq->fetchrow) {
+ my ($node, $dsn) = @row;
+ $DSN{$node} = $dsn;
+}
+
+foreach my $node (keys %DSN) {
+ my $dsn = $DSN{$node};
+ test_node($node, $dsn);
+}
+
+sub test_node {
+ my ($node, $dsn) = @_;
+
+ print "\nTests for node $node - DSN = $dsn\n========================================\n";
+
+ my $listener_query = "select relpages, reltuples from pg_catalog.pg_class where relname = 'pg_listener';";
+ my $res = $dbh->exec($listener_query);
+ my ($relpages, $reltuples);
+ while (my @row = $res->fetchrow) {
+ ($relpages, $reltuples) = @row;
+ }
+ print qq{pg_listener info:
+Pages: $relpages
+Tuples: $reltuples
+};
+
+ print "\nSize Tests\n================================================\n";
+ my $sizequeries = qq{select relname, relpages, reltuples from pg_catalog.pg_class where relname in ('sl_log_1', 'sl_log_2', 'sl_seqlog') order by relname;};
+ $res = $dbh->exec($sizequeries);
+ while (my @row = $res->fetchrow) {
+ my ($relname, $relpages, $reltuples) = @row;
+ printf "%15s %8d %9f\n", $relname, $relpages, $reltuples;
+ }
+
+ print "\nListen Path Analysis\n===================================================\n";
+ my $inadequate_paths = qq{
+select li_origin, count(*) from _$cluster.sl_listen
+group by li_origin
+having count(*) < (select count(*) - 1 from _$cluster.sl_node );
+};
+ $res = $dbh->exec($inadequate_paths);
+ while (my @row = $res->fetchrow) {
+ my ($origin, $count) = @row;
+ printf "Problem node: %4d Listen path count for node: %d\n", $origin, $count;
+ $listenproblems++;
+ }
+ my $missing_paths = qq{
+ select * from (select n1.no_id as origin, n2.no_id as receiver
+ from _$cluster.sl_node n1, _$cluster.sl_node n2 where n1.no_id != n2.no_id) as foo
+ where not exists (select 1 from _$cluster.sl_listen
+ where li_origin = origin and li_receiver = receiver);
+};
+ $res = $dbh->exec($missing_paths);
+ while (my @row = $res->fetchrow) {
+ my ($origin, $receiver) = @row;
+ printf "(origin,receiver) where there is exists a direct path missing in sl_listen: (%d,%d)\n",
+ $origin, $receiver;
+ $listenproblems++;
+ }
+
+ # Each subscriber node must have a direct listen path
+ my $no_direct_path = qq{
+ select sub_set, sub_provider, sub_receiver from _$cluster.sl_subscribe where not exists
+ (select 1 from _$cluster.sl_listen
+ where li_origin = sub_provider and li_receiver = sub_receiver and li_provider = sub_provider);
+};
+ $res = $dbh->exec($no_direct_path);
+ while (my @row = $res->fetchrow) {
+ my ($set, $provider, $receiver) = @row;
+ printf "No direct path found for set %5d from provider %5d to receiver %5d\n", $set, $provider, $receiver;
+ $listenproblems++;
+ }
+
+ if ($listenproblems > 0) {
+ print "sl_listen problems found: $listenproblems\n";
+ } else {
+ print "No problems found with sl_listen\n";
+ }
+
+ print "\n--------------------------------------------------------------------------------\n";
+ print "Summary of event info\n";
+ printf "%7s %9s %9s %12s %12s\n", "Origin", "Min SYNC", "Max SYNC", "Min SYNC Age", "Max SYNC Age";
+ print "================================================================================\n";
+
+ my $event_summary = qq{
+ select ev_origin, min(ev_seqno), max(ev_seqno),
+ date_trunc('minutes', min(now() - ev_timestamp)),
+ date_trunc('minutes', max(now() - ev_timestamp))
+ from _$cluster.sl_event group by ev_origin;
+ };
+ $res = $dbh->exec($event_summary);
+ while (my @row = $res->fetchrow) {
+ my ($origin, $minsync, $maxsync, $minage, $maxage) = @row;
+ printf "%7s %9d %9d %12s %12s\n", $origin, $minsync, $maxsync, $minage, $maxage;
+ }
+ print "\n";
+
+ print "\n---------------------------------------------------------------------------------\n";
+ print "Summary of sl_confirm aging\n";
+ printf "%9s %9s %9s %9s %12s %12s\n", "Origin", "Receiver", "Min SYNC", "Max SYNC", "Age of latest SYNC", "Age of eldest SYNC";
+ print "=================================================================================\n";
+ my $confirm_summary = qq{
+
+ select con_origin, con_received, min(con_seqno) as minseq,
+ max(con_seqno) as maxseq, date_trunc('minutes', min(now()-con_timestamp)) as age1,
+ date_trunc('minutes', max(now()-con_timestamp)) as age2
+ from _$cluster.sl_confirm
+ group by con_origin, con_received
+ order by con_origin, con_received;
+ };
+
+ $res = $dbh->exec($confirm_summary);
+ while (my @row = $res->fetchrow) {
+ my ($origin, $receiver, $minsync, $maxsync, $minage, $maxage) = @row;
+ printf "%9s %9s %9s %9s %12s %12s\n", $origin, $receiver, $minsync, $maxsync, $minage, $maxage;
+ }
+ print "\n";
+
+ print "\n------------------------------------------------------------------------------\n";
+ print "\nListing of old open connections\n";
+ printf "%15s %15s %15s %12s %20s\n", "Database", "PID", "User", "Query Age", "Query";
+ print "================================================================================\n";
+
+ my $old_conn_query = qq{
+ select datname, procpid, usename, date_trunc('minutes', now() - query_start), substr(current_query,0,20)
+ from pg_stat_activity
+ where (now() - query_start) > '1:30'::interval and
+ current_query <> '<IDLE>'
+ order by query_start;
+ };
+
+ $res = $dbh->exec($old_conn_query);
+ while (my @row = $res->fetchrow) {
+ my ($db, $pid, $user, $age, $query) = @row;
+ printf "%15s %15d %15s %12s %20s\n", $db, $pid, $user, $age, $query;
+ }
+ print "\n";
+}
+
+sub show_usage {
+ my ($inerr) = @_;
+ if ($inerr) {
+ chomp $inerr;
+ print $inerr, "\n";
+ }
+ die "$0 --host --database --user --cluster --port=integer --password";
+}
Index: monitoring.sgml
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/doc/adminguide/monitoring.sgml,v
retrieving revision 1.7
retrieving revision 1.8
diff -Ldoc/adminguide/monitoring.sgml -Ldoc/adminguide/monitoring.sgml -u -w -r1.7 -r1.8
--- doc/adminguide/monitoring.sgml
+++ doc/adminguide/monitoring.sgml
@@ -62,6 +62,63 @@
about how the threads work, what to expect in the logs after you run a
</para>
</sect2>
+
+<sect2 id="testslonystate"> <title> test_slony_state.pl </title>
+
+<para> This script is in preliminary stages, and may be used to do
+some analysis of the state of a <productname>Slony-I</productname>
+cluster.</para>
+
+<para> You specify arguments including <option>database</option>,
+<option>host</option>, <option>user</option>,
+<option>cluster</option>, <option>password</option>, and
+<option>port</option> to connect to any of the nodes on a cluster.</para>
+
+<para> The script then rummages through <envar>sl_path</envar> to find
+all of the nodes in the cluster, and the DSNs to allow it to, in turn,
+connect to each of them.</para>
+
+<para> For each node, the script examines the state of things,
+including such things as:
+
+<itemizedlist>
+<listitem><para> Checking <envar> sl_listen </envar> for some
+<quote>analytically determinable</quote> problems. It lists paths
+that are not covered.
+
+<listitem><para> Providing a summary of events by origin node
+
+<para> If a node hasn't submitted any events in a while, that likely suggests a problem.
+
+<listitem><para> Summarizes the <quote>aging</quote> of table <envar>sl_confirm</envar>
+
+<para> If one or another of the nodes in the cluster hasn't reported
+back recently, that tends to lead to cleanups of tables like <envar>
+sl_log_1 </envar> and <envar> sl_seqlog </envar> not taking place.
+
+<listitem><para> Summarizes what transactions have been running for a
+long time
+
+<para> This only works properly if the statistics collector is
+configured to collect command strings, as controlled by the option
+<option> stats_command_string = true </option> in <filename>
+postgresql.conf </filename>.
+
+<para> If you have broken applications that hold connections open,
+this will find them.
+
+<para> If you have broken applications that hold connections open,
+that has several unsalutory effects as <link
+linkend="longtxnsareevil"> described in the FAQ</ulink>.
+
+</itemizedlist>
+
+<para> The script does not yet do much in the way of diagnosis work;
+it should be enhanced to be able to, based on some parameterization,
+notify someone of those problems it encounters.
+
+</sect2>
+
</sect1>
</article>
<!-- Keep this comment at the end of the file
Index: faq.sgml
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/doc/adminguide/faq.sgml,v
retrieving revision 1.9
retrieving revision 1.10
diff -Ldoc/adminguide/faq.sgml -Ldoc/adminguide/faq.sgml -u -w -r1.9 -r1.10
--- doc/adminguide/faq.sgml
+++ doc/adminguide/faq.sgml
@@ -860,12 +860,12 @@
</answer>
</qandaentry>
-<qandaentry>
+<qandaentry id="longtxnsareevil">
-<question><para> Replication has been slowing down, I'm seeing <command>
-FETCH 100 FROM LOG </command> queries running for a long time, <envar>
-sl_log_1 </envar> is growing, and performance is, well, generally
-getting steadily worse. </para>
+<question><para> Replication has been slowing down, I'm seeing
+<command> FETCH 100 FROM LOG </command> queries running for a long
+time, <envar> sl_log_1 </envar> is growing, and performance is, well,
+generally getting steadily worse. </para>
</question>
<answer> <para> There are actually a number of possible causes for
- Previous message: [Slony1-commit] By cbbrowne: Mods per Steve Simms...
- Next message: [Slony1-commit] By cbbrowne: Added docs for a condition which breaks replication
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the Slony1-commit mailing list