CVS User Account cvsuser
Tue Jan 18 22:57:47 PST 2005
Log Message:
-----------
Added in a script to test the state of a Slony-I cluster, along with
some documentation

Modified Files:
--------------
    slony1-engine/doc/adminguide:
        faq.sgml (r1.9 -> r1.10)
        monitoring.sgml (r1.7 -> r1.8)

Added Files:
-----------
    slony1-engine/tools:
        test_slony_state.pl (r1.1)

-------------- next part --------------
--- /dev/null
+++ tools/test_slony_state.pl
@@ -0,0 +1,204 @@
+#!perl   # -*- perl -*-
+# $Id: test_slony_state.pl,v 1.1 2005/01/18 22:57:34 cbbrowne Exp $
+# Christopher Browne
+# Copyright 2004
+# Afilias Canada
+
+# This script, given DSN parameters to access a Slony-I cluster,
+# submits a number of queries to test the state of the nodes in the
+# cluster.
+
+use Pg;
+use Getopt::Long;
+#use strict;
+
+my $sleep_seconds = 4;
+
+my $goodopts = GetOptions("help", "database=s", "host=s", "user=s", "cluster=s",
+			  "password=s", "port=s");
+if (defined($opt_help)) {
+  show_usage();
+}
+my ($database,$user, $port, $cluster, $host, $password, $set, $finalquery);
+
+$database = $opt_database if (defined($opt_database));
+$port = 5432;
+$port = $opt_port if (defined($opt_port));
+$user = $opt_user if (defined($opt_user));
+$password = $opt_password if (defined($opt_password));
+$host = $opt_host if (defined($opt_host));
+$cluster = $opt_cluster if (defined($opt_cluster));
+
+#DBI: my $initialDSN = "dbi:Pg:dbname=$database;host=$host;port=$port";
+my $initialDSN = "dbname=$database host=$host port=$port";
+$initialDSN = $initialDSN . " password=$password" if defined($opt_password);
+
+print "DSN: $initialDSN\n===========================\n";
+
+# DBI: my $dbh = DBI->connect($initialDSN, $user, $password,
+# 		       {RaiseError => 0, PrintError => 0, AutoCommit => 1});
+# die "connect: $DBI::errstr" if ( !defined($dbh) || $DBI::err );
+my $dbh = Pg::connectdb($initialDSN);
+
+print "Rummage for DSNs\n=============================\n";
+# Query to find live DSNs
+my $dsnsquery =
+"
+   select p.pa_server, p.pa_conninfo
+   from _$cluster.sl_path p
+   where exists (select * from _$cluster.sl_subscribe s where
+                          (s.sub_provider = p.pa_server or s.sub_receiver = p.pa_server) and
+                          sub_active = 't')
+   group by pa_server, pa_conninfo;
+";
+
+print "Query:\n$dsnsquery\n";
+$tq = $dbh->exec($dsnsquery);
+my %DSN;
+while (my @row = $tq->fetchrow) {
+  my ($node, $dsn) = @row;
+  $DSN{$node} = $dsn;
+}
+
+foreach my $node (keys %DSN) {
+  my $dsn = $DSN{$node};
+  test_node($node, $dsn);
+}
+
+sub test_node {
+  my ($node, $dsn) = @_;
+
+  print "\nTests for node $node - DSN = $dsn\n========================================\n";
+
+  my $listener_query = "select relpages, reltuples from pg_catalog.pg_class where relname = 'pg_listener';";
+  my $res = $dbh->exec($listener_query);
+  my ($relpages, $reltuples);
+  while (my @row = $res->fetchrow) {
+    ($relpages, $reltuples) = @row;
+  }
+  print qq{pg_listener info:
+Pages: $relpages
+Tuples: $reltuples
+};
+
+  print "\nSize Tests\n================================================\n";
+  my $sizequeries = qq{select relname, relpages, reltuples from pg_catalog.pg_class where relname in ('sl_log_1', 'sl_log_2', 'sl_seqlog') order by relname;};
+  $res = $dbh->exec($sizequeries);
+  while (my @row = $res->fetchrow) {
+    my ($relname, $relpages, $reltuples) = @row;
+    printf "%15s  %8d %9f\n", $relname, $relpages, $reltuples;
+  }
+
+  print "\nListen Path Analysis\n===================================================\n";
+  my $inadequate_paths = qq{
+select li_origin, count(*) from _$cluster.sl_listen
+group by li_origin
+having count(*) < (select count(*) - 1 from _$cluster.sl_node );
+};
+  $res = $dbh->exec($inadequate_paths);
+  while (my @row = $res->fetchrow) {
+    my ($origin, $count) = @row;
+    printf "Problem node: %4d  Listen path count for node: %d\n", $origin, $count;
+    $listenproblems++;
+  }
+  my $missing_paths = qq{
+   select * from (select n1.no_id as origin, n2.no_id as receiver
+     from _$cluster.sl_node n1, _$cluster.sl_node n2 where n1.no_id != n2.no_id) as foo
+   where not exists (select 1 from _$cluster.sl_listen
+                     where li_origin = origin and li_receiver = receiver);
+};
+  $res = $dbh->exec($missing_paths);
+  while (my @row = $res->fetchrow) {
+    my ($origin, $receiver) = @row;
+    printf "(origin,receiver) where there is exists a direct path missing in sl_listen: (%d,%d)\n", 
+      $origin, $receiver;
+    $listenproblems++;
+  }
+
+  # Each subscriber node must have a direct listen path
+  my $no_direct_path = qq{
+    select sub_set, sub_provider, sub_receiver from _$cluster.sl_subscribe where not exists
+        (select 1 from _$cluster.sl_listen 
+         where li_origin = sub_provider and li_receiver = sub_receiver and li_provider = sub_provider);
+};
+  $res = $dbh->exec($no_direct_path);
+  while (my @row = $res->fetchrow) {
+    my ($set, $provider, $receiver) = @row;
+    printf "No direct path found for set %5d from provider %5d to receiver %5d\n", $set, $provider, $receiver;
+    $listenproblems++;
+  }
+
+  if ($listenproblems > 0) {
+    print "sl_listen problems found: $listenproblems\n";
+  } else {
+    print "No problems found with sl_listen\n";
+  }
+
+  print "\n--------------------------------------------------------------------------------\n";
+  print "Summary of event info\n";
+  printf "%7s %9s %9s %12s %12s\n", "Origin", "Min SYNC", "Max SYNC", "Min SYNC Age", "Max SYNC Age";
+  print "================================================================================\n";
+
+  my $event_summary = qq{
+  select ev_origin, min(ev_seqno), max(ev_seqno),
+         date_trunc('minutes', min(now() - ev_timestamp)),
+         date_trunc('minutes', max(now() - ev_timestamp))
+     from _$cluster.sl_event group by ev_origin;
+  };
+  $res = $dbh->exec($event_summary);
+  while (my @row = $res->fetchrow) {
+    my ($origin, $minsync, $maxsync, $minage, $maxage) = @row;
+    printf "%7s %9d %9d %12s %12s\n", $origin, $minsync, $maxsync, $minage, $maxage;
+  }
+  print "\n";
+
+  print "\n---------------------------------------------------------------------------------\n";
+  print "Summary of sl_confirm aging\n";
+  printf "%9s  %9s  %9s  %9s  %12s  %12s\n", "Origin", "Receiver", "Min SYNC", "Max SYNC", "Age of latest SYNC", "Age of eldest SYNC";
+  print "=================================================================================\n";
+  my $confirm_summary = qq{
+
+    select con_origin, con_received, min(con_seqno) as minseq,
+           max(con_seqno) as maxseq, date_trunc('minutes', min(now()-con_timestamp)) as age1,
+           date_trunc('minutes', max(now()-con_timestamp)) as age2
+    from _$cluster.sl_confirm
+    group by con_origin, con_received
+    order by con_origin, con_received;
+  };
+
+  $res = $dbh->exec($confirm_summary);
+  while (my @row = $res->fetchrow) {
+    my ($origin, $receiver, $minsync, $maxsync, $minage, $maxage) = @row;
+    printf "%9s  %9s  %9s  %9s  %12s  %12s\n", $origin, $receiver, $minsync, $maxsync, $minage, $maxage;
+  }
+  print "\n";
+
+  print "\n------------------------------------------------------------------------------\n";
+  print "\nListing of old open connections\n";
+  printf "%15s %15s %15s %12s %20s\n", "Database", "PID", "User", "Query Age", "Query";
+  print "================================================================================\n";
+
+  my $old_conn_query = qq{
+     select datname, procpid, usename, date_trunc('minutes', now() - query_start), substr(current_query,0,20)
+     from pg_stat_activity
+     where  (now() - query_start) > '1:30'::interval and
+            current_query <> '<IDLE>'
+     order by query_start;
+  };
+
+  $res = $dbh->exec($old_conn_query);
+  while (my @row = $res->fetchrow) {
+    my ($db, $pid, $user, $age, $query) = @row;
+    printf "%15s %15d %15s %12s %20s\n", $db, $pid, $user, $age, $query;
+  }
+  print "\n";
+}
+
+sub show_usage {
+  my ($inerr) = @_;
+  if ($inerr) {
+    chomp $inerr;
+    print $inerr, "\n";
+  }
+  die "$0  --host --database --user --cluster --port=integer --password";
+}
Index: monitoring.sgml
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/doc/adminguide/monitoring.sgml,v
retrieving revision 1.7
retrieving revision 1.8
diff -Ldoc/adminguide/monitoring.sgml -Ldoc/adminguide/monitoring.sgml -u -w -r1.7 -r1.8
--- doc/adminguide/monitoring.sgml
+++ doc/adminguide/monitoring.sgml
@@ -62,6 +62,63 @@
 about how the threads work, what to expect in the logs after you run a
 </para>
 </sect2>
+
+<sect2 id="testslonystate"> <title> test_slony_state.pl </title>
+
+<para> This script is in preliminary stages, and may be used to do
+some analysis of the state of a <productname>Slony-I</productname>
+cluster.</para>
+
+<para> You specify arguments including <option>database</option>,
+<option>host</option>, <option>user</option>,
+<option>cluster</option>, <option>password</option>, and
+<option>port</option> to connect to any of the nodes on a cluster.</para>
+
+<para> The script then rummages through <envar>sl_path</envar> to find
+all of the nodes in the cluster, and the DSNs to allow it to, in turn,
+connect to each of them.</para>
+
+<para> For each node, the script examines the state of things,
+including such things as:
+
+<itemizedlist>
+<listitem><para> Checking <envar> sl_listen </envar> for some
+<quote>analytically determinable</quote> problems.  It lists paths
+that are not covered.  
+
+<listitem><para> Providing a summary of events by origin node
+
+<para> If a node hasn't submitted any events in a while, that likely suggests a problem.
+
+<listitem><para> Summarizes the <quote>aging</quote> of table <envar>sl_confirm</envar>
+
+<para> If one or another of the nodes in the cluster hasn't reported
+back recently, that tends to lead to cleanups of tables like <envar>
+sl_log_1 </envar> and <envar> sl_seqlog </envar> not taking place.
+
+<listitem><para> Summarizes what transactions have been running for a
+long time
+
+<para> This only works properly if the statistics collector is
+configured to collect command strings, as controlled by the option
+<option> stats_command_string = true </option> in <filename>
+postgresql.conf </filename>.
+
+<para> If you have broken applications that hold connections open,
+this will find them.
+
+<para> If you have broken applications that hold connections open,
+that has several unsalutory effects as <link
+linkend="longtxnsareevil"> described in the FAQ</ulink>.
+
+</itemizedlist>
+
+<para> The script does not yet do much in the way of diagnosis work;
+it should be enhanced to be able to, based on some parameterization,
+notify someone of those problems it encounters.
+
+</sect2>
+
 </sect1>
 </article>
 <!-- Keep this comment at the end of the file
Index: faq.sgml
===================================================================
RCS file: /usr/local/cvsroot/slony1/slony1-engine/doc/adminguide/faq.sgml,v
retrieving revision 1.9
retrieving revision 1.10
diff -Ldoc/adminguide/faq.sgml -Ldoc/adminguide/faq.sgml -u -w -r1.9 -r1.10
--- doc/adminguide/faq.sgml
+++ doc/adminguide/faq.sgml
@@ -860,12 +860,12 @@
 </answer>
 </qandaentry>
 
-<qandaentry> 
+<qandaentry id="longtxnsareevil">
 
-<question><para> Replication has been slowing down, I'm seeing <command>
-FETCH 100 FROM LOG </command> queries running for a long time, <envar>
-sl_log_1 </envar> is growing, and performance is, well, generally
-getting steadily worse. </para>
+<question><para> Replication has been slowing down, I'm seeing
+<command> FETCH 100 FROM LOG </command> queries running for a long
+time, <envar> sl_log_1 </envar> is growing, and performance is, well,
+generally getting steadily worse. </para>
 </question>
 
 <answer> <para> There are actually a number of possible causes for


More information about the Slony1-commit mailing list