Chris Browne cbbrowne at lists.slony.info
Fri Aug 1 13:18:19 PDT 2008
Update of /home/cvsd/slony1/slony1-engine/src/slon
In directory main.slony.info:/tmp/cvs-serv25935

Modified Files:
	remote_worker.c 
Log Message:
Add in additional performance monitoring, indicating how much time is
being spent and how many queries are being issued against the provider
and the subscriber.

This should enable people to more readily infer for themselves where
bottlenecks are, rather than having to go to Jan & Chris and have them
infer things *really* indirectly.


Index: remote_worker.c
===================================================================
RCS file: /home/cvsd/slony1/slony1-engine/src/slon/remote_worker.c,v
retrieving revision 1.173
retrieving revision 1.174
diff -C2 -d -r1.173 -r1.174
*** remote_worker.c	27 Jun 2008 20:16:04 -0000	1.173
--- remote_worker.c	1 Aug 2008 20:18:17 -0000	1.174
***************
*** 28,33 ****
  
  #define MAXGROUPSIZE 10000		/* What is the largest number of SYNCs we'd
! 								 * want to group together??? */
  
  /* ----------
   * Local definitions
--- 28,35 ----
  
  #define MAXGROUPSIZE 10000		/* What is the largest number of SYNCs we'd
! 					 * want to group together??? */
! 
  
+ void slon_terminate_worker(void);
  /* ----------
   * Local definitions
***************
*** 139,142 ****
--- 141,161 ----
  } WorkGroupLineCode;
  
+ typedef struct PerfMon_s PerfMon;   /* Structure for doing performance monitoring */
+ struct PerfMon_s 
+ {
+ 	struct timeval prev_t;    /* When the monitored event started (set by start_monitored_event) */
+ 	struct timeval now_t;     /* Scratch timestamp taken when the monitored event completes */
+ 	double prov_query_t;      /* Time spent running queries against the provider */
+ 	int prov_query_c;         /* Number of queries run against the provider */
+ 	double subscr_query_t;      /* Time spent running prep queries against the subscriber */
+ 	int subscr_query_c;         /* Number of prep queries run against the subscriber */
+ 	double subscr_iud__t;    /* Time spent running IUD against subscriber */
+ 	int subscr_iud__c;       /* Number of IUD requests run against subscriber */
+ 	double large_tuples_t;          /* Time spent processing large tuples */
+ 	int large_tuples_c;          /* Number of large tuples processed */
+ 	int num_inserts;          /* Count of INSERT operations applied */
+ 	int num_updates;          /* Count of UPDATE operations applied */
+ 	int num_deletes;          /* Count of DELETE operations applied */
+ };
  
  struct ProviderInfo_s
***************
*** 241,244 ****
--- 260,274 ----
   * ----------
   */
+ /*
+  * Monitoring data structure
+  */
+ 
+ static void init_perfmon(PerfMon *pm);
+ static void start_monitored_event(PerfMon *pm);
+ static void monitor_provider_query(PerfMon *pm);
+ static void monitor_subscriber_query(PerfMon *pm);
+ static void monitor_subscriber_iud(PerfMon *pm);
+ static void monitor_largetuples(PerfMon *pm);
+ 
  static void adjust_provider_info(SlonNode *node,
  					 WorkerGroupData *wd, int cleanup);
***************
*** 2103,2112 ****
  					 * Move the message to the head of the queue
  					 */
! 					if (oldmsg != node->message_head)
  					{
  						DLLIST_REMOVE(node->message_head,
! 								node->message_tail, oldmsg);
  						DLLIST_ADD_HEAD(node->message_head,
! 								node->message_tail, oldmsg);
  					}
  				}
--- 2133,2142 ----
  					 * Move the message to the head of the queue
  					 */
! 					if ((SlonWorkMsg *) oldmsg != node->message_head)
  					{
  						DLLIST_REMOVE(node->message_head,
! 									  node->message_tail, (SlonWorkMsg *) oldmsg);
  						DLLIST_ADD_HEAD(node->message_head,
! 								node->message_tail, (SlonWorkMsg *) oldmsg);
  					}
  				}
***************
*** 2420,2423 ****
--- 2450,2470 ----
  		}
  	}
+ 	if (sub_provider < 0) {
+ 		rtcfg_unlock();
+ 		slon_log(SLON_ERROR, "remoteWorkerThread_%d: provider node for set %"
+ 				 "not found in runtime configuration\n",
+ 				 set_id);
+ 		slon_terminate_worker();
+ 		return -1;
+ 		
+ 	}
+ 	if (set_origin < 0) {
+ 		rtcfg_unlock();
+ 		slon_log(SLON_ERROR, "remoteWorkerThread_%d: origin node for set %"
+ 				 "not found in runtime configuration\n",
+ 				 set_id);
+ 		slon_terminate_worker();
+ 		return -1;
+ 	}
  	if (set == NULL)
  	{
***************
*** 2431,2435 ****
  	{
  		rtcfg_unlock();
! 		slon_log(SLON_ERROR, "remoteWorkerThread_%d: node %d "
  				 "not found in runtime configuration\n",
  				 node->no_id, sub_provider);
--- 2478,2482 ----
  	{
  		rtcfg_unlock();
! 		slon_log(SLON_ERROR, "remoteWorkerThread_%d: provider node %d "
  				 "not found in runtime configuration\n",
  				 node->no_id, sub_provider);
***************
*** 3499,3502 ****
--- 3546,3550 ----
  	slon_log(SLON_INFO, "copy_set %d done in %.3f seconds\n", set_id,
  			 TIMEVAL_DIFF(&tv_start, &tv_now));
+ 
  	return 0;
  }
***************
*** 3537,3542 ****
--- 3585,3592 ----
  	int			actionlist_len;
  	int64		min_ssy_seqno;
+ 	PerfMon pm;
  
  	gettimeofday(&tv_start, NULL);
+ 
  	slon_log(SLON_DEBUG2, "remoteWorkerThread_%d: SYNC " INT64_FORMAT
  			 " processing\n",
***************
*** 3547,3550 ****
--- 3597,3602 ----
  	dstring_init(&lsquery);
  
+ 	init_perfmon(&pm);
+ 
  	/*
  	 * If this slon is running in log archiving mode, open a temporary file
***************
*** 3590,3593 ****
--- 3642,3647 ----
  			sprintf(conn_symname, "subscriber_%d_provider_%d",
  					node->no_id, provider->no_id);
+ 
+ 
  			provider->conn = slon_connectdb(provider->pa_conninfo,
  											conn_symname);
***************
*** 3610,3613 ****
--- 3664,3668 ----
  								"select %s.registerNodeConnection(%d); ",
  								rtcfg_namespace, rtcfg_nodeid);
+ 			start_monitored_event(&pm);
  			if (query_execute(node, provider->conn->dbconn, &query) < 0)
  			{
***************
*** 3619,3622 ****
--- 3674,3679 ----
  				return provider->pa_connretry;
  			}
+ 			monitor_provider_query(&pm);
+ 
  			slon_log(SLON_DEBUG1, "remoteWorkerThread_%d: "
  					 "connected to data provider %d on '%s'\n",
***************
*** 3718,3722 ****
--- 3775,3783 ----
  		slon_appendquery(&query, "); ");
  
+ 		start_monitored_event(&pm);
  		res1 = PQexec(local_dbconn, dstring_data(&query));
+ 		slon_log(SLON_INFO, "about to monitor_subscriber_query - pulling big actionid list %d\n", provider);
+ 		monitor_subscriber_query(&pm);
+ 
  		if (PQresultStatus(res1) != PGRES_TUPLES_OK)
  		{
***************
*** 3773,3777 ****
--- 3834,3840 ----
  								rtcfg_namespace,
  								sub_set);
+ 			start_monitored_event(&pm);
  			res2 = PQexec(local_dbconn, dstring_data(&query));
+ 			monitor_subscriber_query(&pm);
  			if (PQresultStatus(res2) != PGRES_TUPLES_OK)
  			{
***************
*** 3946,3949 ****
--- 4009,4013 ----
  						rtcfg_namespace);
  	res1 = PQexec(local_dbconn, dstring_data(&query));
+ 
  	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
  	{
***************
*** 4031,4035 ****
--- 4095,4102 ----
  					if (wgline->log.n_used > 0)
  					{
+ 					  start_monitored_event(&pm);
  						res1 = PQexec(local_dbconn, dstring_data(&(wgline->log)));
+ 
+ 					  monitor_subscriber_iud(&pm);
  						if (PQresultStatus(res1) == PGRES_EMPTY_QUERY)
  						{
***************
*** 4049,4053 ****
--- 4116,4122 ----
  					}
  
+ 					start_monitored_event(&pm);
  					res1 = PQexec(local_dbconn, dstring_data(&(wgline->data)));
+ 					monitor_subscriber_iud(&pm);
  					if (PQresultStatus(res1) == PGRES_EMPTY_QUERY)
  					{
***************
*** 4228,4232 ****
--- 4297,4303 ----
  						 "  group by 1; ");
  
+ 		start_monitored_event(&pm);
  		res1 = PQexec(provider->conn->dbconn, dstring_data(&query));
+ 		monitor_subscriber_iud(&pm);
  		if (PQresultStatus(res1) != PGRES_TUPLES_OK)
  		{
***************
*** 4252,4255 ****
--- 4323,4327 ----
  								rtcfg_namespace,
  						   seql_seqid, node->no_id, seqbuf, seql_last_value);
+ 			start_monitored_event(&pm);
  			if (query_execute(node, local_dbconn, &query) < 0)
  			{
***************
*** 4260,4263 ****
--- 4332,4336 ----
  				return 60;
  			}
+ 			monitor_subscriber_iud(&pm);
  
  			/*
***************
*** 4344,4347 ****
--- 4417,4432 ----
  			 node->no_id, event->ev_seqno,
  			 TIMEVAL_DIFF(&tv_start, &tv_now));
+ 
+  	slon_log(SLON_INFO, 
+ 			 "remoteWorkerThread_%d: SYNC " INT64_FORMAT " Timing: " 
+ 			 " pqexec (s/count)" 
+ 			 "- provider %.3f/%d " 
+ 			 "- subscriber %.3f/%d " 
+ 			 "- IUD %.3f/%d\n",
+ 			 node->no_id, event->ev_seqno, 
+ 			 pm.prov_query_t, pm.prov_query_c, 
+ 			 pm.subscr_query_t, pm.prov_query_c,
+ 			 pm.subscr_iud__t, pm.subscr_iud__c);
+ 
  	return 0;
  }
***************
*** 4383,4393 ****
  	int			line_ncmds;
  
! 	int			num_inserts,
! 				num_deletes,
! 				num_updates;
  
! 	num_inserts = 0;
! 	num_deletes = 0;
! 	num_updates = 0;
  
  	dstring_init(&query);
--- 4468,4474 ----
  	int			line_ncmds;
  
! 	PerfMon pm;
  
! 	init_perfmon(&pm);
  
  	dstring_init(&query);
***************
*** 4618,4622 ****
  					/*
  					 * First make sure that the overall memory usage is inside
! 					 * bouds.
  					 */
  					if (wd->workdata_largemem > sync_max_largemem)
--- 4699,4703 ----
  					/*
  					 * First make sure that the overall memory usage is inside
! 					 * bounds.
  					 */
  					if (wd->workdata_largemem > sync_max_largemem)
***************
*** 4759,4762 ****
--- 4840,4844 ----
  					if (log_cmdsize >= sync_max_rowsize)
  					{
+ 						start_monitored_event(&pm);
  						(void) slon_mkquery(&query2,
  											"select log_cmddata "
***************
*** 4776,4779 ****
--- 4858,4862 ----
  										log_origin, log_txid, log_actionseq);
  						res2 = PQexec(dbconn, dstring_data(&query2));
+ 						monitor_largetuples(&pm);
  						if (PQresultStatus(res2) != PGRES_TUPLES_OK)
  						{
***************
*** 4839,4843 ****
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							num_inserts++;
  							break;
  
--- 4922,4926 ----
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							pm.num_inserts++;
  							break;
  
***************
*** 4847,4851 ****
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							num_updates++;
  							break;
  
--- 4930,4934 ----
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							pm.num_updates++;
  							break;
  
***************
*** 4855,4859 ****
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							num_deletes++;
  							break;
  					}
--- 4938,4942 ----
  											 wd->tab_fqname[log_tableid],
  											 log_cmddata);
! 							pm.num_deletes++;
  							break;
  					}
***************
*** 4959,4967 ****
  
  		slon_log(SLON_INFO, "remoteHelperThread_%d_%d: inserts=%d updates=%d deletes=%d\n",
! 		node->no_id, provider->no_id, num_inserts, num_updates, num_deletes);
  
! 		num_inserts = 0;
! 		num_deletes = 0;
! 		num_updates = 0;
  
  		/*
--- 5042,5052 ----
  
  		slon_log(SLON_INFO, "remoteHelperThread_%d_%d: inserts=%d updates=%d deletes=%d\n",
! 				 node->no_id, provider->no_id, pm.num_inserts, pm.num_updates, pm.num_deletes);
  
!  	slon_log(SLON_INFO, 
! 			 "remoteWorkerThread_%d: sync_helper timing: " 
! 			 " large tuples %.3f/%d\n", 
! 			 node->no_id, 
! 			 pm.large_tuples_t, pm.large_tuples_c);
  
  		/*
***************
*** 5650,5655 ****
  check_set_subscriber(int set_id, int node_id, PGconn *local_dbconn)
  {
- 
- 
  	SlonDString query1;
  	PGresult   *res;
--- 5735,5738 ----
***************
*** 5675,5679 ****
--- 5758,5806 ----
  	PQclear(res);
  	return 1;
+ }
  
+ /*
+  * init_perfmon: zero all of the timing accumulators and event
+  * counters in *perf_info.  Called once before a SYNC (or sync helper)
+  * run starts collecting statistics.
+  */
+ static void init_perfmon(PerfMon *perf_info) {
+   perf_info->prov_query_t = 0.0;
+   perf_info->prov_query_c = 0;
+   perf_info->subscr_query_t = 0.0;
+   perf_info->subscr_query_c = 0;
+   perf_info->subscr_iud__t = 0.0;
+   perf_info->subscr_iud__c = 0;
+   perf_info->large_tuples_t = 0;
+   perf_info->large_tuples_c = 0;
+   perf_info->num_inserts = 0;
+   perf_info->num_updates = 0;
+   perf_info->num_deletes = 0;
+ }
+ /*
+  * start_monitored_event: stamp prev_t with the current wall-clock time,
+  * marking the start of an event whose duration will be accumulated by
+  * one of the monitor_*() functions below.
+  */
+ static void start_monitored_event(PerfMon *perf_info) {
+   gettimeofday(&(perf_info->prev_t), NULL);
+ }
+ /*
+  * monitor_subscriber_query: close out the event begun by
+  * start_monitored_event() as a subscriber-side query, adding the
+  * elapsed wall-clock time to subscr_query_t and bumping subscr_query_c.
+  */
+ static void monitor_subscriber_query(PerfMon *perf_info) {
+   double diff;
+   gettimeofday(&(perf_info->now_t), NULL);
+   diff = TIMEVAL_DIFF(&(perf_info->prev_t), &(perf_info->now_t)); 
+   (perf_info->subscr_query_t) += diff;
+   (perf_info->subscr_query_c) ++;
+ }
+ /*
+  * monitor_provider_query: close out the event begun by
+  * start_monitored_event() as a provider-side query, adding the
+  * elapsed wall-clock time to prov_query_t and bumping prov_query_c.
+  */
+ static void monitor_provider_query(PerfMon *perf_info) {
+   double diff;
+   gettimeofday(&(perf_info->now_t), NULL);
+   diff = TIMEVAL_DIFF(&(perf_info->prev_t), &(perf_info->now_t)); 
+   (perf_info->prov_query_t) += diff;
+   (perf_info->prov_query_c) ++;
+ }
+ /*
+  * monitor_subscriber_iud: close out the event begun by
+  * start_monitored_event() as a subscriber-side INSERT/UPDATE/DELETE,
+  * adding the elapsed wall-clock time to subscr_iud__t and bumping
+  * subscr_iud__c.
+  */
+ static void monitor_subscriber_iud(PerfMon *perf_info) {
+   double diff;
+   gettimeofday(&(perf_info->now_t), NULL);
+   diff = TIMEVAL_DIFF(&(perf_info->prev_t), &(perf_info->now_t)); 
+   (perf_info->subscr_iud__t) += diff;
+   (perf_info->subscr_iud__c) ++;
+ }
  
+ /*
+  * monitor_largetuples: close out the event begun by
+  * start_monitored_event() as large-tuple processing, adding the
+  * elapsed wall-clock time to large_tuples_t and bumping
+  * large_tuples_c.
+  */
+ static void monitor_largetuples(PerfMon *perf_info) {
+   double diff;
+   gettimeofday(&(perf_info->now_t), NULL);
+   diff = TIMEVAL_DIFF(&(perf_info->prev_t), &(perf_info->now_t)); 
+   (perf_info->large_tuples_t) += diff;
+   (perf_info->large_tuples_c) ++;
+ }



More information about the Slony1-commit mailing list