CVS User Account cvsuser
Tue Mar 15 16:41:07 PST 2005
Log Message:
-----------
Add a DBI-based state tester

Added Files:
-----------
    slony1-engine/tools:
        test_slony_state-dbi.pl (r1.1)

-------------- next part --------------
--- /dev/null
+++ tools/test_slony_state-dbi.pl
@@ -0,0 +1,317 @@
+#!perl   # -*- perl -*-
+# $Id: test_slony_state-dbi.pl,v 1.1 2005/03/15 16:41:04 cbbrowne Exp $
+# Christopher Browne
+# Copyright 2005
+# PostgreSQL Global Development Group
+
+# This script, given DSN parameters to access a Slony-I cluster,
+# submits a number of queries to test the state of the nodes in the
+# cluster.
+
+use DBI;
+use Getopt::Long;
+#use strict;
+my %PROBLEMS;
+
+my $sleep_seconds = 4;
+
+my $goodopts = GetOptions("help", "database=s", "host=s", "user=s", "cluster=s",
+			  "password=s", "port=s", "recipient=s", "mailprog=s");
+if (defined($opt_help)) {
+  show_usage();
+}
+my ($database,$user, $port, $cluster, $host, $password, $set, $finalquery);
+
+$database = $opt_database if (defined($opt_database));
+$port = 5432;
+$port = $opt_port if (defined($opt_port));
+$user = $opt_user if (defined($opt_user));
+$password = $opt_password if (defined($opt_password));
+$host = $opt_host if (defined($opt_host));
+$cluster = $opt_cluster if (defined($opt_cluster));
+$recipient = $opt_recipient if (defined($opt_recipient));
+$mailprog = $opt_mailprog if (defined($opt_mailprog));
+
+my $initialDSN = "dbi:Pg:dbname=$database;host=$host;port=$port";
+$initialDSN = $initialDSN . ";password=$password" if defined($opt_password);
+
+print "DSN: $initialDSN\n===========================\n";
+
+my $dbh = DBI->connect($initialDSN) or die "Unable to connect: $DBI::errstr\n";
+
+print "Rummage for DSNs\n=============================\n";
+# Query to find live DSNs
+my $dsnsquery = qq{
+   select p.pa_server, p.pa_conninfo
+   from "_$cluster".sl_path p
+   where exists (select * from "_$cluster".sl_subscribe s where
+                          (s.sub_provider = p.pa_server or s.sub_receiver = p.pa_server) and
+                          sub_active = 't')
+   group by pa_server, pa_conninfo;
+};
+
+print "Query:\n$dsnsquery\n";
+$tq = $dbh->prepare($dsnsquery);
+$tq->execute();
+
+my %DSN;
+while (my @row = $tq->fetchrow_array) {
+  my ($node, $dsn) = @row;
+  $DSN{$node} = $dsn;
+}
+
+foreach my $node (keys %DSN) {
+  my $dsn = $DSN{$node};
+  test_node($node, $dsn);
+}
+
+report_on_problems ();
+
+sub test_node {
+  my ($node, $dsn) = @_;
+
+  print "\nTests for node $node - DSN = $dsn\n========================================\n";
+
+  my $listener_query = "select relpages, reltuples from pg_catalog.pg_class where relname = 'pg_listener';";
+  my $res = $dbh->prepare($listener_query);
+  $res->execute();
+  my ($relpages, $reltuples);
+  while (my @row = $res->fetchrow_array) {
+    ($relpages, $reltuples) = @row;
+  }
+  print qq{pg_listener info:
+Pages: $relpages
+Tuples: $reltuples
+};
+
+  my $HILISTENPAGES = 5000;
+  if ($relpages > $HILISTENPAGES) {
+    add_problem ($node, "pg_listener relpages high - $relpages", 
+		 qq{Number of pages in table pg_listener is $relpages
+This is higher than the warning level of $HILISTENPAGES.
+
+Perhaps a long running transaction is preventing pg_listener from
+being vacuumed out?
+}); 
+  }
+
+  my $HILISTENTUPLES = 200000;
+  if ($reltuples > $HILISTENTUPLES) {
+    add_problem ($node, "pg_listener reltuples high - $reltuples",
+		 qq{Number of tuples in system table pg_listener is $reltuples.
+This is higher than the warning level of $HILISTENTUPLES.
+
+Perhaps a long running transaction is preventing pg_listener from
+being vacuumed out?
+});
+  }
+
+  my $HISLTUPLES=200000;
+  print "\nSize Tests\n================================================\n";
+  my $sizequeries = qq{select relname, relpages, reltuples from pg_catalog.pg_class where relname in ('sl_log_1', 'sl_log_2', 'sl_seqlog') order by relname;};
+  $res = $dbh->prepare($sizequeries);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($relname, $relpages, $reltuples) = @row;
+    printf "%15s  %8d %9f\n", $relname, $relpages, $reltuples;
+    if ($reltuples > $HISLTUPLES) {
+      add_problem($node, "$relname tuples = $reltuples > $HISLTUPLES",
+		  qq{Number of tuples in Slony-I table $relname is $reltuples which
+exceeds $HISLTUPLES.
+
+You may wish to investigate whether or not a node is down, or perhaps
+if sl_confirm entries have not been propagating properly.
+});
+
+    }
+  }
+
+  print "\nListen Path Analysis\n===================================================\n";
+  my $inadequate_paths = qq{
+select li_origin, count(*) from "_$cluster".sl_listen
+group by li_origin
+having count(*) < (select count(*) - 1 from "_$cluster".sl_node );
+};
+  $res = $dbh->prepare($inadequate_paths);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($origin, $count) = @row;
+    printf "Problem node: %4d  Listen path count for node: %d\n", $origin, $count;
+    $listenproblems++;
+  }
+  my $missing_paths = qq{
+   select * from (select n1.no_id as origin, n2.no_id as receiver
+     from "_$cluster".sl_node n1, "_$cluster".sl_node n2 where n1.no_id != n2.no_id) as foo
+   where not exists (select 1 from "_$cluster".sl_listen
+                     where li_origin = origin and li_receiver = receiver);
+};
+  $res = $dbh->prepare($missing_paths);
+  $res->execute();
+  my $allmissingpaths;
+  while (my @row = $res->fetchrow_array) {
+    my ($origin, $receiver) = @row;
+    my $string = sprintf "(origin,receiver) where there is exists a direct path missing in sl_listen: (%d,%d)\n",
+      $origin, $receiver;
+    print $string;
+    $listenproblems++;
+    $allmissingpaths .= $string;
+  }
+  if ($allmissingpaths) {
+    add_problem($node, "Missing sl_listen paths", qq{$allmissingpaths
+
+Please check contents of table sl_listen; some STORE LISTEN requests may be
+necessary.
+});
+  }
+
+  # Each subscriber node must have a direct listen path
+  my $no_direct_path = qq{
+    select sub_set, sub_provider, sub_receiver from "_$cluster".sl_subscribe where not exists
+        (select 1 from "_$cluster".sl_listen 
+         where li_origin = sub_provider and li_receiver = sub_receiver and li_provider = sub_provider);
+};
+  $res = $dbh->prepare($no_direct_path);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($set, $provider, $receiver) = @row;
+    my $string = sprintf "No direct path found for set %5d from provider %5d to receiver %5d\n", $set, $provider, $receiver;
+    print $string;
+    add_problem($node, "Missing path from $provider to $receiver", qq{Missing sl_listen entry - $string
+
+Please check contents of table sl_listen; some STORE LISTEN requests may be
+necessary.
+});
+    $listenproblems++;
+  }
+
+  if ($listenproblems > 0) {
+    print "sl_listen problems found: $listenproblems\n";
+  } else {
+    print "No problems found with sl_listen\n";
+  }
+
+  print "\n--------------------------------------------------------------------------------\n";
+  print "Summary of event info\n";
+  printf "%7s %9s %9s %12s %12s\n", "Origin", "Min SYNC", "Max SYNC", "Min SYNC Age", "Max SYNC Age";
+  print "================================================================================\n";
+
+  my $WANTAGE = "00:30:00";
+  my $event_summary = qq{
+  select ev_origin, min(ev_seqno), max(ev_seqno),
+         date_trunc('minutes', min(now() - ev_timestamp)),
+         date_trunc('minutes', max(now() - ev_timestamp)),
+         min(now() - ev_timestamp) > '$WANTAGE' as agehi
+     from "_$cluster".sl_event group by ev_origin;
+  };
+  $res = $dbh->prepare($event_summary);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($origin, $minsync, $maxsync, $minage, $maxage, $agehi) = @row;
+    printf "%7s %9d %9d %12s %12s %4s\n", $origin, $minsync, $maxsync, $minage, $maxage, $agehi;
+    if ($agehi eq 't') {
+      add_problem($origin, "Events not propagating to node $origin",
+		  qq{Events not propagating quickly in sl_event -
+For origin node $origin, earliest propagated event of age $minage > $WANTAGE
+
+Are slons running for both nodes?
+
+Could listen paths be missing so that events are not propagating?
+});
+    }
+  }
+  print "\n";
+
+  print "\n---------------------------------------------------------------------------------\n";
+  print "Summary of sl_confirm aging\n";
+  printf "%9s  %9s  %9s  %9s  %12s  %12s\n", "Origin", "Receiver", "Min SYNC", "Max SYNC", "Age of latest SYNC", "Age of eldest SYNC";
+  print "=================================================================================\n";
+  my $WANTCONFIRM = "00:30:00";
+  my $confirm_summary = qq{
+
+    select con_origin, con_received, min(con_seqno) as minseq,
+           max(con_seqno) as maxseq, date_trunc('minutes', min(now()-con_timestamp)) as age1,
+           date_trunc('minutes', max(now()-con_timestamp)) as age2,
+           min(now() - con_timestamp) > '$WANTCONFIRM' as tooold
+    from "_$cluster".sl_confirm
+    group by con_origin, con_received
+    order by con_origin, con_received;
+  };
+
+  $res = $dbh->prepare($confirm_summary);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($origin, $receiver, $minsync, $maxsync, $minage, $maxage, $agehi) = @row;
+    printf "%9s  %9s  %9s  %9s  %12s  %12s %4s\n", $origin, $receiver, $minsync, $maxsync, $minage, $maxage, $agehi;
+    if ($agehi eq 't') {
+      add_problem($origin, "Confirmations not propagating from $origin to $receiver",
+		  qq{Confirmations not propagating quickly in sl_confirm -
+
+For origin node $origin, receiver node $receiver, earliest propagated
+confirmation has age $minage > $WANTCONFIRM
+
+Are slons running for both nodes?
+
+Could listen paths be missing so that confirmations are not propagating?
+});
+    }
+  }
+  print "\n";
+
+  print "\n------------------------------------------------------------------------------\n";
+  print "\nListing of old open connections\n";
+  printf "%15s %15s %15s %12s %20s\n", "Database", "PID", "User", "Query Age", "Query";
+  print "================================================================================\n";
+
+  my $ELDERLY_TXN = "01:30:00";
+  my $old_conn_query = qq{
+     select datname, procpid, usename, date_trunc('minutes', now() - query_start), substr(current_query,0,20)
+     from pg_stat_activity
+     where  (now() - query_start) > '$ELDERLY_TXN'::interval and
+            current_query <> '<IDLE>'
+     order by query_start;
+  };
+
+  $res = $dbh->prepare($old_conn_query);
+  $res->execute();
+  while (my @row = $res->fetchrow_array) {
+    my ($db, $pid, $user, $age, $query) = @row;
+    printf "%15s %15d %15s %12s %20s\n", $db, $pid, $user, $age, $query;
+      add_problem($origin, "Old Transactions Kept Open",
+		  qq{Old Transaction still running with age $age > $ELDERLY_TXN
+
+Query: $query
+});
+  }
+  print "\n";
+
+}
+
+sub show_usage {
+  my ($inerr) = @_;
+  if ($inerr) {
+    chomp $inerr;
+    print $inerr, "\n";
+  }
+  die "$0  --host --database --user --cluster --port=integer --password --recipient --mailprog";
+}
+
+sub add_problem {
+  my ($node, $short, $long) = @_;
+  $PROBLEMS{"$node $short"} = $long;
+}
+
+sub report_on_problems {
+  my ($totalproblems, $message);
+  foreach my $key (sort keys %PROBLEMS) {
+    $totalproblems++;
+    $message .= "\nNode: $key\n================================================\n" . $PROBLEMS{$key} . "\n";
+  }
+  if ($totalproblems) {
+    open(MAIL, "|$mailprog -s \"Slony State Test Warning - Cluster $cluster\" $recipient");
+    print MAIL "\n";
+    print MAIL $message;
+    close (MAIL);
+    print "\n\nSending message thus - |$mailprog -s \"Slony State Test Warning - Cluster $cluster\" $recipient\n";
+    print "Message:\n\n$message\n";
+  }
+}


More information about the Slony1-commit mailing list