You can subscribe to this list here.
2010 |
Jan
|
Feb
|
Mar
|
Apr
(4) |
May
(28) |
Jun
(12) |
Jul
(11) |
Aug
(12) |
Sep
(5) |
Oct
(19) |
Nov
(14) |
Dec
(12) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
(18) |
Feb
(30) |
Mar
(115) |
Apr
(89) |
May
(50) |
Jun
(44) |
Jul
(22) |
Aug
(13) |
Sep
(11) |
Oct
(30) |
Nov
(28) |
Dec
(39) |
2012 |
Jan
(38) |
Feb
(18) |
Mar
(43) |
Apr
(91) |
May
(108) |
Jun
(46) |
Jul
(37) |
Aug
(44) |
Sep
(33) |
Oct
(29) |
Nov
(36) |
Dec
(15) |
2013 |
Jan
(35) |
Feb
(611) |
Mar
(5) |
Apr
(55) |
May
(30) |
Jun
(28) |
Jul
(458) |
Aug
(34) |
Sep
(9) |
Oct
(39) |
Nov
(22) |
Dec
(32) |
2014 |
Jan
(16) |
Feb
(16) |
Mar
(42) |
Apr
(179) |
May
(7) |
Jun
(6) |
Jul
(9) |
Aug
|
Sep
(4) |
Oct
|
Nov
(3) |
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
(2) |
May
(4) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
|
1
|
2
(6) |
3
|
4
|
5
|
6
|
7
|
8
|
9
|
10
|
11
|
12
|
13
|
14
|
15
|
16
(2) |
17
(3) |
18
(1) |
19
|
20
|
21
(8) |
22
(6) |
23
(3) |
24
|
25
|
26
|
27
|
28
(1) |
|
|
|
|
|
From: Michael P. <mic...@us...> - 2011-02-18 07:06:32
|
Project "Postgres-XC". The branch, ha_support, has been updated via d73ae5182149b08e0728edb96eee339e0c0498b7 (commit) from f42b489b49f366c78d816708d47b380f9db640d9 (commit) - Log ----------------------------------------------------------------- commit d73ae5182149b08e0728edb96eee339e0c0498b7 Author: Michael P <mic...@us...> Date: Fri Feb 18 15:49:39 2011 +0900 Mirroring and XCM (XC Cluster Manager) implementation. This commit adds support for Datanode Mirroring. This permits creating multiple mirrors of a Datanode. From the application, the Mirrors of a Datanode are seen as a single Datanode. This is customizable with some new GUC parameters in the section DATANODE MIRRORING of postgresql.conf: - mirror_mode, switch for mirror mode, off by default. - mirror_count (Coordinator param), string to set the number of mirrors for each node. It has the format 'num1,num2,...numN'. For example, '2,3' means that your cluster has 2 Datanodes: Datanode 1 has 2 mirrors and Datanode 2 has 3 mirrors. The number of elements in this string has to be the same as num_data_nodes. This is set to '1,1' by default. - preferred_mirror_id (Coordinator param), an integer to set the mirror that is chosen for read operations on replicated tables (in correlation with preferred_data_node). - preferred_data_node (Coordinator param) has been changed to an integer; it permits setting a single preferred node. Note: This parameter was disabled before. - is_primary_mirror (Datanode param), determines if the Datanode is itself a primary or not. Non-primary Datanodes are not authorized to register on GTM. From GTM, Mirrors are seen as a single Datanode. About XCM (XC cluster manager): roughly, this allocates a portion of shared memory containing a lot of information about the cluster. A new configuration parameter called pgxc_ha.conf is available. 
A couple of new GUC parameters have been created for XCM activation in the CLUSTER MANAGER section: - cluster_manager, switch to activate XCM for the node or not. If activated, the node will try to take the connection information for GTM from shared memory. Pooler will do the same for the connection parameters (host and port) of other Datanodes. - pgxc_mirror_id (Datanode parameter), ID necessary to help the mirror check if it is itself a primary or not. In case it is not a primary, it does not register on GTM. XCM creates a couple of utilities: xcm_canstart - Test if the cluster can start; xcm_freemember - Free the shared memory area; xcm_getevent - Get status reports emitted from Postgres-XC nodes; xcm_getflag - Get the status flag for a given component; xcm_home - Set the repository where shared memory information is saved; xcm_initmember - Initialize the allocated shared memory based on information from pgxc_ha.conf; xcm_printshm - Print to stdout the information in shared memory; xcm_putevent - Inquire the status of the component and write the status to stdout; xcm_setflag - Change the status flag for a given component; xcm_telhowto - Help for XCM. Postgres-XC nodes send reports of failed nodes (Coordinators or Datanodes) through the pooler when acquiring a connection fails. Reports about a failed GTM are also delivered. XCM has been written by Koichi Suzuki. Datanode Mirroring and the XCM interface for XC have been written by me. 
diff --git a/src/Makefile b/src/Makefile index 7fbbcb3..cd6a6dc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,6 +19,7 @@ all install installdirs uninstall distprep: # GTM should be built before backend because of dependancy $(MAKE) -C gtm $@ $(MAKE) -C interfaces $@ + $(MAKE) -C pgxc $@ $(MAKE) -C backend $@ $(MAKE) -C backend/utils/mb/conversion_procs $@ $(MAKE) -C backend/snowball $@ @@ -50,6 +51,7 @@ clean: $(MAKE) -C port $@ $(MAKE) -C timezone $@ $(MAKE) -C gtm $@ + $(MAKE) -C pgxc $@ $(MAKE) -C backend $@ $(MAKE) -C backend/snowball $@ $(MAKE) -C include $@ @@ -65,6 +67,7 @@ distclean maintainer-clean: $(MAKE) -C port $@ $(MAKE) -C timezone $@ $(MAKE) -C gtm $@ + $(MAKE) -C pgxc $@ $(MAKE) -C backend $@ $(MAKE) -C backend/snowball $@ $(MAKE) -C include $@ diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index a9bf1d6..5264708 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -17,7 +17,8 @@ #include "access/transam.h" #include "utils/elog.h" #include "miscadmin.h" -#include "pgxc/pgxc.h" +#include "nodes/pg_list.h" +#include "pgxc/mirror.h" /* Configuration variables */ char *GtmHost = "localhost"; @@ -42,6 +43,15 @@ InitGTM() { /* 256 bytes should be enough */ char conn_str[256]; + char *gtm_host = NULL; + int gtm_port; + + /* Get Connection parameters from Cluster manager if necessary */ + if (IsXCM) + { + gtm_port = PGXCMirror_GetLocalGTMPort(); + gtm_host = PGXCMirror_GetLocalGTMHost(); + } /* If this thread is postmaster itself, it contacts gtm identifying itself */ if (!IsUnderPostmaster) @@ -53,11 +63,20 @@ InitGTM() else if (IS_PGXC_DATANODE) remote_type = PGXC_NODE_DATANODE; - sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d remote_type=%d postmaster=1", - GtmHost, GtmPort, PGXCNodeId, remote_type); + if (IsXCM) + sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d remote_type=%d postmaster=1", + gtm_host, gtm_port, PGXCNodeId, remote_type); + else + sprintf(conn_str, 
"host=%s port=%d pgxc_node_id=%d remote_type=%d postmaster=1", + GtmHost, GtmPort, PGXCNodeId, remote_type); } else - sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d", GtmHost, GtmPort, PGXCNodeId); + { + if (IsXCM) + sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d", gtm_host, gtm_port, PGXCNodeId); + else + sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d", GtmHost, GtmPort, PGXCNodeId); + } conn = PQconnectGTM(conn_str); if (GTMPQstatus(conn) != CONNECTION_OK) @@ -71,6 +90,10 @@ InitGTM() errno = save_errno; CloseGTM(); + + /* Report error to Cluster manager */ + if (IsXCM) + PGXCMirror_ReportGTMFail(); } } diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 40777bf..9cd45a1 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -40,11 +40,11 @@ #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" #ifdef PGXC -#include "pgxc/pgxc.h" #include "access/gtm.h" #include "pgxc/planner.h" #include "tcop/tcopprot.h" #include "pgxc/poolmgr.h" +#include "pgxc/mirror.h" #endif #include "utils/rel.h" @@ -2075,8 +2075,16 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) ListCell *nodeitem; RemoteQuery *step = makeNode(RemoteQuery); bool is_local = false; + bool nodenum_defined = false; List *raw_parsetree_list; ListCell *raw_parsetree_item; + int total_num_nodes; + int nodenum_real = 0; + + if (is_coordinator) + total_num_nodes = NumCoords; + else + total_num_nodes = NumDataNodes; if (list_length(nodelist) > 1) ereport(ERROR, @@ -2088,20 +2096,63 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to use EXECUTE DIRECT"))); - /* Check if execute direct is local and if node number is correct*/ + /* + * Check if execute direct is local and if node number is correct + * For the moment EXECUTE DIRECT does not support multiple nodes. 
+ */ foreach(nodeitem, nodelist) { - int nodenum = intVal(lfirst(nodeitem)); + if (IsA(lfirst(nodeitem), PGXCMirror)) + { + PGXCMirror *node = lfirst(nodeitem); + int nodenum; + if (!IsPGXCMirrorMode) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Cannot use Mirror ID format in non-mirror mode"))); + if (is_coordinator) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Cannot use Mirror ID format for Coordinator"))); - if (nodenum < 1 || - (!is_coordinator && nodenum > NumDataNodes) || - (is_coordinator && nodenum > NumCoords)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Node Number %d is incorrect", nodenum))); + if (node->data_node_id > NumDataNodes) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Node Number %d is incorrect", node->data_node_id))); + + if (node->mirror_id > PGXCMirror_GetMirrorCount(node->data_node_id)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Mirror Number %d is incorrect", node->mirror_id))); - if (nodenum == PGXCNodeId && is_coordinator) - is_local = true; + if (PGXCMirror_IsMirrorOffline(node->data_node_id, node->mirror_id)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Datanode %d Mirror %d is offline", + node->data_node_id, node->mirror_id))); + + nodenum = PGXCMirror_GetMirrorGlobalID(node->data_node_id, node->mirror_id); + if (nodenum > PGXCMirror_GetMirrorTotalCount() || + nodenum < 1) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Node Number %d is incorrect", nodenum))); + + nodenum_real = nodenum; + nodenum_defined = true; + } + else + { + int nodenum = intVal(lfirst(nodeitem)); + if (nodenum < 1 || + (!is_coordinator && nodenum > total_num_nodes) || + (is_coordinator && nodenum > total_num_nodes)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Node Number %d is incorrect", nodenum))); + if (nodenum == PGXCNodeId && is_coordinator) + is_local = true; + } } /* Transform the query into a raw parse list 
*/ @@ -2208,7 +2259,26 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) foreach(nodeitem, nodelist) { int nodenum = intVal(lfirst(nodeitem)); - step->exec_nodes->nodelist = lappend_int(step->exec_nodes->nodelist, nodenum); + + /* + * In Mirror mode, + * If node number is a simple datanode number + * for a SELECT command pick up only one node in the subset + * for a UTILITY command pick up the whole subset + * If node number is mirror format dn_id/mirror_id, keep it as it is. + */ + if (step->exec_direct_type == EXEC_DIRECT_LOCAL_UTILITY && + IsPGXCMirrorMode && + !nodenum_defined) + step->exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(nodenum, true); + else if (step->exec_direct_type == EXEC_DIRECT_SELECT && + IsPGXCMirrorMode && + !nodenum_defined) + step->exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(nodenum, false); + else if (nodenum_defined) /* Node Number where to run has already been calculated */ + step->exec_nodes->nodelist = lappend_int(step->exec_nodes->nodelist, nodenum_real); + else + step->exec_nodes->nodelist = lappend_int(step->exec_nodes->nodelist, nodenum); } step->sql_statement = pstrdup(query); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index e1266ed..09186d3 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -183,6 +183,7 @@ static TypeName *TableFuncTypeName(List *columns); VariableSetStmt *vsetstmt; /* PGXC_BEGIN */ DistributeBy *distby; + PGXCMirror *pgxc_mirror; /* PGXC_END */ } @@ -425,6 +426,7 @@ static TypeName *TableFuncTypeName(List *columns); %type <ival> opt_frame_clause frame_extent frame_bound /* PGXC_BEGIN */ %type <distby> OptDistributeBy +%type <pgxc_mirror> mirror_elt /* PGXC_END */ @@ -485,7 +487,7 @@ static TypeName *TableFuncTypeName(List *columns); LEAST LEFT LEVEL LIKE LIMIT LISTEN LOAD LOCAL LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOGIN_P /* PGXC_BEGIN */ - MAPPING MATCH MAXVALUE MINUTE_P MINVALUE MODE MODULO MONTH_P MOVE + MAPPING MATCH MAXVALUE 
MINUTE_P MINVALUE MIRROR MODE MODULO MONTH_P MOVE /* PGXC_END */ NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NOCREATEDB NOCREATEROLE NOCREATEUSER NODE NOINHERIT NOLOGIN_P NONE NOSUPERUSER @@ -6541,7 +6543,9 @@ opt_analyze: /***************************************************************************** * * QUERY: - * EXECUTE DIRECT ON (COORDINATOR num, ... | NODE num, ...) query + * EXECUTE DIRECT ON + * (COORDINATOR num, ... | + * NODE (node_num | node_num/mirror_num), ... ) query * *****************************************************************************/ @@ -6549,7 +6553,6 @@ ExecDirectStmt: EXECUTE DIRECT ON COORDINATOR coord_list DirectStmt { ExecDirectStmt *n = makeNode(ExecDirectStmt); n->coordinator = TRUE; - n->nodes = NIL; n->nodes = $5; n->query = $6; $$ = (Node *)n; @@ -6569,7 +6572,7 @@ DirectStmt: ; coord_list: - Iconst { $$ = list_make1(makeInteger($1)); } + Iconst { $$ = list_make1(makeInteger($1)); } | coord_list ',' Iconst { $$ = lappend($1, makeInteger($3)); } | '*' { @@ -6581,8 +6584,10 @@ coord_list: ; data_node_list: - Iconst { $$ = list_make1(makeInteger($1)); } + Iconst { $$ = list_make1(makeInteger($1)); } + | mirror_elt { $$ = list_make1($1); } | data_node_list ',' Iconst { $$ = lappend($1, makeInteger($3)); } + | data_node_list ',' mirror_elt { $$ = lappend($1, $3); } | '*' { int i; @@ -6592,6 +6597,17 @@ data_node_list: } ; +mirror_elt: + Iconst '/' Iconst + { + PGXCMirror *n = makeNode(PGXCMirror); + + n->data_node_id = $1; + n->mirror_id = $3; + $$ = (Node *) n; + } + ; + /***************************************************************************** * * QUERY: @@ -10425,6 +10441,9 @@ unreserved_keyword: | MAXVALUE | MINUTE_P | MINVALUE +/* PGXC_BEGIN */ + | MIRROR +/* PGXC_END */ | MODE /* PGXC_BEGIN */ | MODULO diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index d1aef8b..6729b15 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -38,6 +38,7 @@ 
#include "utils/tqual.h" #include "pgxc/poolmgr.h" #include "pgxc/locator.h" +#include "pgxc/mirror.h" #include "catalog/pgxc_class.h" #include "catalog/namespace.h" @@ -55,12 +56,9 @@ bool locatorInited = false; /* GUC parameter */ -char *PreferredDataNodes = NULL; +int PreferredDataNode = 0; int primary_data_node = 1; -/* Preferred to use when reading from replicated tables */ -static List *globalPreferredNodes = NIL; - /* * init_mapping_table - initializes a mapping table * @@ -88,10 +86,19 @@ List * GetAnyDataNode(void) { List *destList = NULL; - - /* try and pick from the preferred list */ - if (globalPreferredNodes != NULL) - return destList = lappend_int(NULL, linitial_int(globalPreferredNodes)); + /* + * Try and pick the preferred node. + * In Mirror mode, pick up the preferred Mirror. + * If a preferred datanode is defined but not a mirror, + * use one mirror randomly in the subset of mirrors of this Datanode. + * + * PGXCTODO: When tables defined in a subset of nodes is supported, + * we need also to check if those tables can use it. 
+ */ + if (PreferredDataNode != 0 && !IsPGXCMirrorMode) + return destList = lappend_int(NULL, PreferredDataNode); + else if (PreferredDataNode != 0 && IsPGXCMirrorMode) + return destList = PGXCMirror_GetSubsetMirrors(PreferredDataNode, false); return destList = lappend_int(NULL, 1); } @@ -112,18 +119,14 @@ hash_range(char *key) int value; if (key == NULL || key == '\0') - { return 0; - } length = strlen(key); value = 0x238F13AF * length; for (i = 0; i < length; i++) - { value = value + ((key[i] << i * 5 % 24) & 0x7fffffff); - } return (1103515243 * value + 12345) % 65537 & HASH_MASK; } @@ -369,21 +372,47 @@ int GetRoundRobinNode(Oid relid) { int ret_node; - Relation rel = relation_open(relid, AccessShareLock); - Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED || + Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED || rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN); - ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode); + if (IsPGXCMirrorMode) + { + /* + * PGXCTODO: Round Robin list contains also the list of mirrors + * But we have to be sure that a Global Datanode ID is + * returned. 
+ * When node subsets are supported, this part should be + * modified to support node subsets correctly + */ + bool done = false; + while (!done) + { + ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode); - /* Move round robin indicator to next node */ - if (rel->rd_locator_info->roundRobinNode->next != NULL) - rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; - else - /* reset to first one */ - rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->nodeList->head; + if (ret_node < NumDataNodes + 1) + done = true; + /* Move round robin indicator to next node */ + if (rel->rd_locator_info->roundRobinNode->next != NULL) + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; + else + /* reset to first one */ + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->nodeList->head; + } + } + else + { + ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode); + + /* Move round robin indicator to next node */ + if (rel->rd_locator_info->roundRobinNode->next != NULL) + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; + else + /* reset to first one */ + rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->nodeList->head; + } relation_close(rel, AccessShareLock); return ret_node; @@ -426,9 +455,17 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, switch (rel_loc_info->locatorType) { case LOCATOR_TYPE_REPLICATED: + { + int primary_node_num; + + /* In Mirror mode, primary mirror is selected in subset of datanodes */ + if (IsPGXCMirrorMode) + primary_node_num = PGXCMirror_GetPrimaryMirrorNum(primary_data_node); + else + primary_node_num = primary_data_node; if (accessType == RELATION_ACCESS_UPDATE || - accessType == RELATION_ACCESS_INSERT) + accessType == RELATION_ACCESS_INSERT) { /* we need to write to all synchronously */ exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); @@ -437,11 +474,11 @@ 
GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, * Write to primary node first, to reduce chance of a deadlock * on replicated tables. If 0, do not use primary copy. */ - if (primary_data_node && exec_nodes->nodelist + if (primary_node_num && exec_nodes->nodelist && list_length(exec_nodes->nodelist) > 1) /* make sure more than 1 */ { - exec_nodes->primarynodelist = lappend_int(NULL, primary_data_node); - list_delete_int(exec_nodes->nodelist, primary_data_node); + exec_nodes->primarynodelist = lappend_int(NULL, primary_node_num); + list_delete_int(exec_nodes->nodelist, primary_node_num); } } else @@ -454,40 +491,71 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, * avoid distributed deadlock if updating the same row * concurrently */ - exec_nodes->nodelist = lappend_int(NULL, primary_data_node); + exec_nodes->nodelist = lappend_int(NULL, primary_node_num); } - else if (globalPreferredNodes != NULL) + else if (PreferredDataNode != 0) { - /* try and pick from the preferred list */ - foreach(prefItem, globalPreferredNodes) + /* + * Try and pick the preferred node. + * make sure it is valid for this relation. + * In Mirror mode, be sur to pick up the right node. + */ + foreach(stepItem, rel_loc_info->nodeList) { - /* make sure it is valid for this relation */ - foreach(stepItem, rel_loc_info->nodeList) + if (lfirst_int(stepItem) == PreferredDataNode) { - if (lfirst_int(stepItem) == lfirst_int(prefItem)) - { - exec_nodes->nodelist = lappend_int(NULL, lfirst_int(prefItem)); - break; - } + if (IsPGXCMirrorMode) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(PreferredDataNode, false); + else + exec_nodes->nodelist = lappend_int(NULL, PreferredDataNode); + + break; } } } if (exec_nodes->nodelist == NULL) - /* read from just one of them. 
Use round robin mechanism */ - exec_nodes->nodelist = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); + { + int rr_node_num = GetRoundRobinNode(rel_loc_info->relid); + /* We are in READ case here, just pick up one of them with round robin */ + if (IsPGXCMirrorMode) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(rr_node_num, false); + else + exec_nodes->nodelist = lappend_int(NULL, rr_node_num); + } } break; - + } case LOCATOR_TYPE_HASH: if (partValue != NULL) - /* in prototype, all partitioned tables use same map */ - exec_nodes->nodelist = lappend_int(NULL, get_node_from_hash(hash_range_int(*partValue))); + { + int hash_node_num = get_node_from_hash(hash_range_int(*partValue)); + /* + * In prototype, all partitioned tables use same map. + * + * In Mirror mode, + * for a SELECT query, pick up one node in the subset of datanodes + * for a DML query, pick up all the nodes of the subset + */ + if ((IsPGXCMirrorMode && accessType == RELATION_ACCESS_INSERT) || + (IsPGXCMirrorMode && accessType == RELATION_ACCESS_UPDATE)) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(hash_node_num, true); + else if (IsPGXCMirrorMode && accessType == RELATION_ACCESS_READ) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(hash_node_num, false); + else + exec_nodes->nodelist = lappend_int(NULL, hash_node_num); + } else if (accessType == RELATION_ACCESS_INSERT) - /* Insert NULL to node 1 */ - exec_nodes->nodelist = lappend_int(NULL, 1); + { + /* Insert NULL to node 1 + * In mirror mode, pick up all the mirrors of one Datanode */ + if (IsPGXCMirrorMode) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(1, true); + else + exec_nodes->nodelist = lappend_int(NULL, 1); + } else /* Use all nodes for other types of access */ exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); @@ -495,12 +563,33 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, case LOCATOR_TYPE_MODULO: if (partValue != NULL) - /* in prototype, all partitioned tables use same map 
*/ - exec_nodes->nodelist = lappend_int(NULL, get_node_from_modulo(compute_modulo(*partValue))); + { + int modulo_node_num = get_node_from_modulo(compute_modulo(*partValue)); + /* + * In prototype, all partitioned tables use same map. + * + * In Mirror mode, + * for a SELECT query, pick up one node in the subset of datanodes + * for a DML query, pick up all the nodes of the subset + */ + if ((IsPGXCMirrorMode && accessType == RELATION_ACCESS_INSERT) || + (IsPGXCMirrorMode && RELATION_ACCESS_UPDATE)) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(modulo_node_num, true); + else if (IsPGXCMirrorMode && accessType == RELATION_ACCESS_READ) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(modulo_node_num, false); + else + exec_nodes->nodelist = lappend_int(NULL, modulo_node_num); + } else if (accessType == RELATION_ACCESS_INSERT) - /* Insert NULL to node 1 */ - exec_nodes->nodelist = lappend_int(NULL, 1); + { + /* Insert NULL to node 1 + * In mirror mode, pick up all the mirrors of one Datanode */ + if (IsPGXCMirrorMode) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(1, true); + else + exec_nodes->nodelist = lappend_int(NULL, 1); + } else /* Use all nodes for other types of access */ exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); @@ -517,8 +606,12 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, /* round robin, get next one */ if (accessType == RELATION_ACCESS_INSERT) { + int rr_node_num = GetRoundRobinNode(rel_loc_info->relid); /* write to just one of them */ - exec_nodes->nodelist = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); + if (IsPGXCMirrorMode) + exec_nodes->nodelist = PGXCMirror_GetSubsetMirrors(rr_node_num, true); + else + exec_nodes->nodelist = lappend_int(NULL, rr_node_num); } else { @@ -602,14 +695,19 @@ List * GetAllDataNodes(void) { int i; + int node_num = NumDataNodes; + List *nodeList = NIL; + + /* In Mirroring mode, all the mirrors are selected */ + if (IsPGXCMirrorMode) + node_num = 
PGXCMirror_GetMirrorTotalCount(); /* * PGXCTODO - add support for having nodes on a subset of nodes * For now, assume on all nodes */ - List *nodeList = NIL; - for (i = 1; i < NumDataNodes + 1; i++) + for (i = 1; i < node_num + 1; i++) { nodeList = lappend_int(nodeList, i); } @@ -709,6 +807,7 @@ RelationBuildLocator(Relation rel) /** PGXCTODO - add support for having nodes on a subset of nodes * For now, assume on all nodes + * In Mirror mode, this includes also Mirrors !! */ relationLocInfo->nodeList = GetAllDataNodes(); relationLocInfo->nodeCount = relationLocInfo->nodeList->length; @@ -725,6 +824,14 @@ RelationBuildLocator(Relation rel) * pick a random one to start with, * since each process will do this independently */ + /* + * PGXCTODO: Now mapping table uses all the nodes... + * In Mirror mode, NodeCount contains the TOTAL number of mirrors + * even if real number of Datanode is NumDataNodes. + * When changing mapping table to support subsets of nodes, + * this part should be changed in accordance to Mirror Mode also. + * GetRoundRobin has to return the global Datanode number, not a single mirror number. 
+ */ srand(time(NULL)); offset = rand() % relationLocInfo->nodeCount + 1; relationLocInfo->roundRobinNode = relationLocInfo->nodeList->head; /* initialize */ diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 0c19756..7d01528 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -35,6 +35,8 @@ #include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/planner.h" +#include "pgxc/mirror.h" +#include "pgxc/poolmgr.h" #include "tcop/pquery.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -167,7 +169,7 @@ static void InitXCWalkerContext(XCWalkerContext *context); static RemoteQuery *makeRemoteQuery(void); static void validate_part_col_updatable(const Query *query); static bool is_pgxc_safe_func(Oid funcid); - +static List *get_mirror_nodes(void); /* * Find position of specified substring in the string @@ -728,6 +730,10 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) errmsg("cursor \"%s\" is held from a previous transaction", cursor_name))); + if (IsPGXCMirrorMode) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("CURRENT OF not yet supported in Mirroring mode"))); /* * The cursor must have a current result row: per the SQL spec, it's * an error if not. 
@@ -901,7 +907,13 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) { /* Take target node from last scan tuple of referenced step */ int curr_node = node->ss.ss_ScanTupleSlot->tts_dataNode; - context->query_step->exec_nodes->nodelist = lappend_int(context->query_step->exec_nodes->nodelist, curr_node); + if (IsPGXCMirrorMode) + context->query_step->exec_nodes->nodelist = + list_concat(context->query_step->exec_nodes->nodelist, + PGXCMirror_GetSubsetMirrors(curr_node, true)); + else + context->query_step->exec_nodes->nodelist = + lappend_int(context->query_step->exec_nodes->nodelist, curr_node); } FreeRelationLocInfo(rel_loc_info1); @@ -1690,7 +1702,8 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) table_usage_type = TABLE_USAGE_TYPE_USER_REPLICATED; context->query_step->exec_nodes->tableusagetype = table_usage_type; - } else if (context->conditions->partitioned_expressions) { + } + else if (context->conditions->partitioned_expressions) { /* probably we can determine nodes on execution time */ foreach(lc, context->conditions->partitioned_expressions) { Expr_Comparison *expr_comp = (Expr_Comparison *) lfirst(lc); @@ -1709,7 +1722,8 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) break; } } - } else { + } + else { /* run query on all nodes */ context->query_step->exec_nodes = makeNode(ExecNodes); context->query_step->exec_nodes->baselocatortype = @@ -1717,8 +1731,13 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) context->query_step->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_USER; context->query_step->exec_nodes->primarynodelist = NULL; - context->query_step->exec_nodes->nodelist = - list_copy(rel_loc_info->nodeList); + /* In Mirror mode, get one mirror for a read operation */ + if (IsPGXCMirrorMode && context->accessType == RELATION_ACCESS_READ || + IsPGXCMirrorMode && context->accessType == RELATION_ACCESS_READ_FOR_UPDATE) + context->query_step->exec_nodes->nodelist = 
get_mirror_nodes(); + else + context->query_step->exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); + context->query_step->exec_nodes->expr = NULL; context->query_step->exec_nodes->relid = NULL; context->query_step->exec_nodes->accesstype = context->accessType; @@ -1737,9 +1756,14 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) { Literal_Comparison *lit_comp = (Literal_Comparison *) lfirst(lc); - test_exec_nodes = GetRelationNodes( - lit_comp->rel_loc_info, &(lit_comp->constant), - RELATION_ACCESS_READ); + if (context->accessType == RELATION_ACCESS_UPDATE) + test_exec_nodes = GetRelationNodes( + lit_comp->rel_loc_info, &(lit_comp->constant), + RELATION_ACCESS_UPDATE); + else + test_exec_nodes = GetRelationNodes( + lit_comp->rel_loc_info, &(lit_comp->constant), + RELATION_ACCESS_READ); test_exec_nodes->tableusagetype = table_usage_type; if (context->query_step->exec_nodes == NULL) @@ -3334,3 +3358,24 @@ GetHashExecNodes(RelationLocInfo *rel_loc_info, ExecNodes **exec_nodes, const Ex } +/* + * In Mirror mode + * A query that has to be executed on all nodes just needs + * to execute on one mirror of each node. + */ +static List* +get_mirror_nodes(void) +{ + List *list_nodes = NIL; + List *res; + int count; + + for (count = 0; count < NumDataNodes; count++) + { + res = PGXCMirror_GetSubsetMirrors(count + 1, false); + + list_nodes = list_concat(list_nodes, res); + } + + return list_nodes; +} diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile index f0701c5..b43d5b4 100644 --- a/src/backend/pgxc/pool/Makefile +++ b/src/backend/pgxc/pool/Makefile @@ -14,6 +14,6 @@ subdir = src/backend/pgxc/pool top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o postgresql_fdw.o poolutils.o +OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o postgresql_fdw.o poolutils.o mirror.o ../../../pgxc/xcm/libxcm.a include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 3673421..4570bdf 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -34,7 +34,7 @@ #include "utils/tuplesort.h" #include "utils/snapmgr.h" #include "pgxc/locator.h" -#include "pgxc/pgxc.h" +#include "pgxc/mirror.h" #define END_QUERY_TIMEOUT 20 #define DATA_NODE_FETCH_SIZE 1 @@ -112,6 +112,8 @@ stat_statement() static void stat_transaction(int node_count) { + int node_num = IsPGXCMirrorMode ? PGXCMirror_GetMirrorTotalCount() : NumDataNodes; + total_transactions++; if (autocommit) total_autocommit++; @@ -125,12 +127,12 @@ stat_transaction(int node_count) else statements_per_transaction[current_tran_statements]++; current_tran_statements = 0; - if (node_count > 0 && node_count <= NumDataNodes) + if (node_count > 0 && node_count <= node_num) { if (!nodes_per_transaction) { - nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int)); - memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int)); + nodes_per_transaction = (int *) malloc(node_num * sizeof(int)); + memset(nodes_per_transaction, 0, node_num * sizeof(int)); } nodes_per_transaction[node_count - 1]++; } @@ -175,9 +177,10 @@ stat_log() MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions); if (nodes_per_transaction) { - int i; + int i; + int node_num = IsPGXCMirrorMode ? 
PGXCMirror_GetMirrorTotalCount() : NumDataNodes; - for (i = 0; i < NumDataNodes; i++) + for (i = 0; i < node_num; i++) elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)", i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions); } @@ -1103,9 +1106,10 @@ BufferConnection(PGXCNodeHandle *conn) */ if (combiner->tuplesortstate) { + int node_num = IsPGXCMirrorMode ? PGXCMirror_GetMirrorTotalCount() : NumDataNodes; combiner->connections[combiner->current_conn] = NULL; if (combiner->tapenodes == NULL) - combiner->tapenodes = (int*) palloc0(NumDataNodes * sizeof(int)); + combiner->tapenodes = (int*) palloc0(node_num * sizeof(int)); combiner->tapenodes[combiner->current_conn] = conn->nodenum; } else @@ -1242,7 +1246,7 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) else combiner->current_conn = 0; } - else if (res = RESPONSE_DATAROW && have_tuple) + else if (res == RESPONSE_DATAROW && have_tuple) { /* * We already have a tuple and received another one, leave it till @@ -1514,11 +1518,12 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, static void clear_write_node_list() { + int node_count = IsPGXCMirrorMode ? PGXCMirror_GetMirrorTotalCount() : NumDataNodes; + /* we just malloc once and use counter */ if (write_node_list == NULL) - { - write_node_list = (PGXCNodeHandle **) malloc(NumDataNodes * sizeof(PGXCNodeHandle *)); - } + write_node_list = (PGXCNodeHandle **) malloc(node_count * sizeof(PGXCNodeHandle *)); + write_node_count = 0; } @@ -2312,7 +2317,8 @@ PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_from) { int i, j; - int conn_count = list_length(nodelist) == 0 ? NumDataNodes : list_length(nodelist); + const int node_num = IsPGXCMirrorMode ? PGXCMirror_GetMirrorTotalCount() : NumDataNodes; + int conn_count = list_length(nodelist) == 0 ? 
node_num : list_length(nodelist); struct timeval *timeout = NULL; PGXCNodeAllHandles *pgxc_handles; PGXCNodeHandle **connections; @@ -2344,7 +2350,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ * So store connections in an array where index is node-1. * Unused items in the array should be NULL */ - copy_connections = (PGXCNodeHandle **) palloc0(NumDataNodes * sizeof(PGXCNodeHandle *)); + copy_connections = (PGXCNodeHandle **) palloc0(node_num * sizeof(PGXCNodeHandle *)); i = 0; foreach(nodeitem, nodelist) copy_connections[lfirst_int(nodeitem) - 1] = connections[i++]; @@ -2363,7 +2369,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ /* Check status of connections */ /* We want to track new "write" nodes, and new nodes in the current transaction * whether or not they are write nodes. */ - if (write_node_count < NumDataNodes) + if (write_node_count < node_num) { for (i = 0; i < conn_count; i++) { @@ -2710,15 +2716,16 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, { int i; int nLen = htonl(4); + const int node_num = IsPGXCMirrorMode ? 
PGXCMirror_GetMirrorTotalCount() : NumDataNodes; RemoteQueryState *combiner = NULL; bool need_tran; bool error = false; struct timeval *timeout = NULL; /* wait forever */ - PGXCNodeHandle *connections[NumDataNodes]; + PGXCNodeHandle *connections[node_num]; PGXCNodeHandle *primary_handle = NULL; int conn_count = 0; - for (i = 0; i < NumDataNodes; i++) + for (i = 0; i < node_num; i++) { PGXCNodeHandle *handle = copy_connections[i]; @@ -3020,7 +3027,12 @@ get_exec_connections(RemoteQueryState *planstate, /* The slot should be of type DataRow */ Assert(!TupIsNull(slot) && slot->tts_dataRow); - nodelist = list_make1_int(slot->tts_dataNode); + if (IsPGXCMirrorMode) + nodelist = list_concat(nodelist, + PGXCMirror_GetSubsetMirrors(slot->tts_dataNode, false)); + else + nodelist = list_make1_int(slot->tts_dataNode); + primarynode = NIL; } else @@ -3064,10 +3076,8 @@ get_exec_connections(RemoteQueryState *planstate, if (list_length(nodelist) == 0 && (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES)) - { /* Primary connection is included in this number of connections if it exists */ - dn_conn_count = NumDataNodes; - } + dn_conn_count = IsPGXCMirrorMode ? PGXCMirror_GetMirrorTotalCount() : NumDataNodes; else { if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) @@ -4398,6 +4408,7 @@ static PGXCNodeAllHandles * pgxc_get_all_transaction_nodes(PGXCNode_HandleRequested status_requested) { PGXCNodeAllHandles *pgxc_connections; + int node_num = IsPGXCMirrorMode ? 
PGXCMirror_GetMirrorTotalCount() : NumDataNodes; pgxc_connections = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); if (!pgxc_connections) @@ -4408,7 +4419,7 @@ pgxc_get_all_transaction_nodes(PGXCNode_HandleRequested status_requested) } pgxc_connections->datanode_handles = (PGXCNodeHandle **) - palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + palloc(node_num * sizeof(PGXCNodeHandle *)); pgxc_connections->coord_handles = (PGXCNodeHandle **) palloc(NumCoords * sizeof(PGXCNodeHandle *)); if (!pgxc_connections->datanode_handles || !pgxc_connections->coord_handles) @@ -4481,7 +4492,7 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to close data node statemrnt"))); + errmsg("Failed to close data node statement"))); } if (pgxc_node_send_sync(connections[i]) != 0) { diff --git a/src/backend/pgxc/pool/mirror.c b/src/backend/pgxc/pool/mirror.c new file mode 100644 index 0000000..1700695 --- /dev/null +++ b/src/backend/pgxc/pool/mirror.c @@ -0,0 +1,1034 @@ +/*------------------------------------------------------------------------- + * + * mirror.c + * + * File containing API to interact with Fault Sync module + * It is necessary to activate the GUC parameter mirror_mode + * to call the APIs of this file. 
+ * Only this file is authorized to call APIs of Fault Sync + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "utils/memutils.h" +#include "pgxc/poolmgr.h" +#include "pgxc/mirror.h" +#include "pgxc/locator.h" +#include "utils/builtins.h" +#include "pgxc/xcm/node_membership.h" +#include "nodes/pg_list.h" +#include "access/gtm.h" + +/* This include should contain all the APIs this file needs */ + +/* + * Get list of Datanode mirrors + * We need to get a complete array of Datanodes (How many mirrors for each Datanode) + * and save it as a global value. + * This has to be done at Node startup. + */ + +/* List of guc params for mirroring mode */ +bool IsPGXCMirrorMode = false; /* Activate Datanode Mirroring */ +bool IsXCM = false; /* Activate Cluster manager module */ + +/* Pooler (Coordinator) GUC parameters */ +int PreferredMirrorId = 0; /* Preferred Mirror ID, coupled with PreferredDataNode */ +int PrimaryMirrorId = 1; /* Primary Mirror ID for Replicated Handling */ +char *MirrorCount = NULL; /* Number of Mirrors for each Datanode */ + +/* Datanode GUC Parameters */ +bool IsPrimaryMirror = true; /* Is Datanode a Primary Mirror or not (for registration on GTM) */ +int PGXCMirrorId = 1; /* Defines mirror ID of Local Node (for Datanode only) */ + +/* + * Contains the number of mirrors for each node + * 1 means that there is only 1 Datanode, like in normal mode + */ +/* Number of Mirrors for each Datanode */ +static int *PGXCNodeMirrorCount; +/* Total number of mirrors */ +static int MirrorTotalCount = 0; + +/* + * Get list of connection parameters and save them as global values. + * It is important to respect the format used by Pooler. + * For Datanode, bring back the GTM connection info only. 
+ * A Datanode also has to know whether it is itself a mirror or not, + * we need to know if this node needs to register (only primaries are allowed to). + * + * If the node is a Datanode mirror, it needs to know its mirror number. + * Its Datanode id can be obtained from pgxc_node_id. + * + * For Coordinator, bring back the mirror count for each Datanode + * Connection parameters. + * Connection parameters to Coordinators, to Datanode, to GTM. + * Prepare them in a nice shape and save them as global values. + * In case an error occurs at this step, the node cannot start up. + */ + +/* + * Note: users list and password list are still managed by postgresql.conf + */ + +/* PGXCMirror_GetXXX and PGXCMirror_SetXXX APIs can be invoked only at node startup */ + +/* + * PGXCMirror_SetNodePrimary + * + * Set PGXCNodePrimary parameters for Replicated Handling + * This is called at node startup + */ +void +PGXCMirror_SetNodePrimary(void) +{ + /* + * Get the primary node parameters from Fault Sync module + * In other cases GUC params have all the necessary data. 
+ */ + if (IsXCM) + { + int datanode_id, mirror_id; + + if (get_xcm_preferred_mirror(PGXCNodeId, &datanode_id, &mirror_id) < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not get Primary node data"))); + + /* Assign obtained values */ + PreferredDataNode = datanode_id; + PreferredMirrorId = mirror_id; + } +} + +/* + * PGXCMirror_SetMirrorCountList + * + * Set PGXCNodeMirrorCount array with the list of primary node numbers + * This is called at node startup in TopMemoryContext + */ +int +PGXCMirror_SetMirrorCountList(void) +{ + MirrorTotalCount = 0; + + PGXCNodeMirrorCount = (int *) MemoryContextAlloc(TopMemoryContext, sizeof(int) * NumDataNodes); + if (!PGXCNodeMirrorCount) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Get number of mirrors for each Datanode */ + if (IsXCM) + { + int i; + + /* Here cluster manager is activated */ + for (i = 0; i < NumDataNodes; i++) + { + int mirror_count; + + if (get_xcm_mirror_count(i + 1, &mirror_count) < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not get Mirror count"))); + PGXCNodeMirrorCount[i] = mirror_count; + MirrorTotalCount += PGXCNodeMirrorCount[i]; + } + } + else + { + /* + * In this case we get the necessary info from GUC param. + * Same parsing policy as in pooler is used. 
+ */ + char *rawstring; + List *elemlist; + ListCell *l; + int i, j; + + rawstring = pstrdup(MirrorCount); + + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"mirror_count\""))); + } + + i = 0; + foreach(l, elemlist) + { + int curnum = pg_atoi((char *) lfirst(l), 4, 0); + + /* Ignore extra entries, if any */ + if (i >= NumDataNodes) + break; + + PGXCNodeMirrorCount[i] = curnum; + MirrorTotalCount += curnum; + i++; + } + + list_free(elemlist); + pfree(rawstring); + + /* Validate */ + if (i < NumDataNodes - 1) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax for \"mirror_count\""))); + + /* if only 1 element, copy the first value to all the others */ + if (i == 1) + { + for (j = 1; j < NumDataNodes; j++) + PGXCNodeMirrorCount[i] = PGXCNodeMirrorCount[0]; + + MirrorTotalCount = PGXCNodeMirrorCount[0] * NumDataNodes; + } + } + + return MirrorTotalCount; +} + +/* + * PGXCMirror_GetMirrorTotalCount + * + * Return total number of Mirrors in the cluster + */ +int +PGXCMirror_GetMirrorTotalCount(void) +{ + return MirrorTotalCount; +} + +int +PGXCMirror_GetMirrorCount(int pgxc_node_id) +{ + Assert(pgxc_node_id > 0 && pgxc_node_id < NumDataNodes + 1); + return PGXCNodeMirrorCount[pgxc_node_id - 1]; +} + +/* Check mirror(s) status */ +bool +PGXCMirror_AreAllMirrorsOffline(int pgxc_node_id) +{ + Assert(pgxc_node_id > 0 && pgxc_node_id <= NumDataNodes); + + if (IsXCM) + { + int count; + int mirror_count = PGXCMirror_GetMirrorCount(pgxc_node_id); + + /* Check if all the mirrors are offline */ + for (count = 0; count < mirror_count;count++) + if (!XCM_IS_FAULT(PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, pgxc_node_id, count + 1))) + return false; /* Got 1 mirror online */ + + /* If we are here, no mirrors have been found online for this datanode :( */ + return true; + } + + /* + * There is now no 
implementation to determine mirror status without XCM + * so assume they are online. + * If that is not the case, the Coordinator will return an error anyway. + */ + return false; +} + +bool +PGXCMirror_IsMirrorOffline(int pgxc_node_id, int mirror_id) +{ + Assert(mirror_id <= PGXCMirror_GetMirrorCount(pgxc_node_id) && + pgxc_node_id <= NumDataNodes); + + /* We need an interface compatible with XCM in this case */ + if (IsXCM) + { + if (XCM_IS_FAULT(PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, pgxc_node_id, mirror_id))) + return true; /* It looks like it is offline */ + else + return false; + } + + /* + * In Mirror mode, there is no functionality to check if a mirror is online or not + * so assume it is online by default + */ + return false; +} + +/* + * PGXCMirror_GetSubsetMirrors + * + * Return a subset of mirrors for given node ID depending on the type of operation + * in global array of Datanodes. + */ +List* +PGXCMirror_GetSubsetMirrors(int pgxc_node_id, bool is_write) +{ + List *list_mirrors = NIL; + int count; + int sum = 0; + int mirror_count = PGXCNodeMirrorCount[pgxc_node_id - 1]; + + /* Determine the offset at which the Mirror subset is located in the global array */ + for (count = 0; count < pgxc_node_id - 1; count++) + sum += PGXCNodeMirrorCount[count]; + + if (PGXCMirror_AreAllMirrorsOffline(pgxc_node_id)) + ereport(LOG, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("All Mirrors are Offline for Node %d", pgxc_node_id))); + + if (is_write) + { + /* Select all the subset of online Mirrors */ + for (count = sum + 1; count < sum + mirror_count + 1; count++) + { + /* check if this mirror is online before associating it */ + if (IsXCM && + !XCM_IS_FAULT(PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, + pgxc_node_id, count - sum))) + list_mirrors = lappend_int(list_mirrors, count); + else if (!IsXCM) + list_mirrors = lappend_int(list_mirrors, count); + else + elog(WARNING, "Datanode %d mirror %d is in failed state", pgxc_node_id, count - sum); + } + } + else + { + 
/* + * Select only one node + * If a preferred node exists, choose it. + * If no preferred node, pick up one randomly. + * If the preferred node is not online, choose another mirror randomly + */ + bool done = false; + + Assert(PreferredMirrorId <= PGXCNodeMirrorCount[pgxc_node_id - 1]); + /* Check also that preferred node is online */ + if (PreferredMirrorId != 0) + { + if (IsXCM && + !XCM_IS_FAULT(PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, + pgxc_node_id, PreferredMirrorId))) + { + done = true; + list_mirrors = lappend_int(list_mirrors, sum + PreferredMirrorId); + } + else if (!IsXCM) + { + /* In mirror mode without XCM active, we suppose the mirror is online */ + done = true; + list_mirrors = lappend_int(list_mirrors, sum + PreferredMirrorId); + } + } + + while (!done) + { + int offset; + + srand(time(NULL)); + offset = rand() % mirror_count + 1; + if (!XCM_IS_FAULT(PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, pgxc_node_id, offset))) + { + list_mirrors = lappend_int(list_mirrors, sum + offset); + done = true; + } + } + } + + return list_mirrors; +} + +/* + * PGXCMirror_GetPrimaryMirrorNum + * + * Return Primary Mirror location in global array of Datanodes (pooler format). 
+ */ +int +PGXCMirror_GetPrimaryMirrorNum(int primary_data_node) +{ + int count; + int sum = 0; + + /* Determine number of Primary Node in the Global Array of Datanodes */ + for (count = 0; count < primary_data_node - 1; count++) + sum += PGXCNodeMirrorCount[count]; + + return sum + PrimaryMirrorId; +} + +/* + * PGXCMirror_GetMirrorGlobalID + * + * Return position in pooler array of given mirror + */ +int +PGXCMirror_GetMirrorGlobalID(int pgxc_node_id, int mirror_id) +{ + int count; + int sum = 0; + + for (count = 0; count < pgxc_node_id - 1; count++) + sum += PGXCNodeMirrorCount[count]; + + return sum + mirror_id; +} + +/* + * PGXCMirror_GetMirrorGlobalID + * + * Return Datanode ID and Mirror ID for a given node ID of pooler array + */ +int +PGXCMirror_GetMirrorIDAndDatanodeID(int global_id, int *mirror_id) +{ + int datanode_id = 0; + int sum = 0; + + /* Determine datanode ID */ + while (datanode_id < NumDataNodes) + { + sum += PGXCNodeMirrorCount[datanode_id]; + datanode_id++; + if (global_id <= sum) + break; + } + + /* Determine Mirror ID */ + sum -= PGXCNodeMirrorCount[datanode_id - 1]; + *mirror_id = global_id - sum; + + return datanode_id; +} + +/* + * To respect Pooler Connection format, Host and Port strings respect the following format + * + * For example, in the case of a configuration with 2 Datanodes, each having 2 mirrors: + * Datanode(1,1),Datanode(1,2),Datanode(2,1),Datanode(2,2) + * + * With 3 Datanodes, having 1 mirror for the 1st, 3 mirrors for the 2nd, and 2 mirrors for the 3rd: + * Datanode(1,1),Datanode(2,1),Datanode(2,2),Datanode(2,3),Datanode(3,1),Datanode(3,2) + * + * This permits to keep a simple array format that is easily managed by pooler. + * Complementary data about primary numbers and number of mirrors for each Datanode + * is contained in additional arrays locally saved here. + */ + +/* + * PGXCMirror_GetHostString + * + * Called at pooler Initialization to get + * the Datanode or Coordinator string at Pooler initialization. 
+ * This is used to replace CoordHosts and DataNodeHosts in pooler + */ +char* +PGXCMirror_GetHostTotalString(RemoteConnTypes conn_type) +{ + char *host_str = NULL; + int i, num_nodes; + + Assert(IS_PGXC_COORDINATOR && IsXCM); + Assert(conn_type == REMOTE_CONN_COORD || conn_type == REMOTE_CONN_DATANODE); + + if (conn_type == REMOTE_CONN_COORD) + num_nodes = NumCoords; + else if (conn_type == REMOTE_CONN_DATANODE) + num_nodes = NumDataNodes; + + for (i = 0; i < num_nodes; i++) + { + if (conn_type == REMOTE_CONN_COORD) + { + int local_len; + char *buf; + + buf = PGXCMirror_GetCoordHost(i + 1); + local_len = strlen(buf); + + if (!host_str) + { + host_str = (char *) palloc(local_len + 1); + sprintf(host_str, "%s", buf); + } + else + { + host_str = (char *) repalloc(host_str, + strlen(host_str) + local_len + 2); + sprintf(host_str, "%s,%s", host_str, buf); + } + pfree(buf); + } + else if (conn_type == REMOTE_CONN_DATANODE) + { + int count; + + /* Build Datanode Host string */ + Assert(PGXCNodeMirrorCount[i] > 0); + + for (count = 0; count < PGXCNodeMirrorCount[i]; count++) + { + int local_len; + char *buf; + + buf = PGXCMirror_GetDataNodeHost(i + 1, count + 1); + local_len = strlen(buf); + + /* OK, got it. Build the string. */ + if (!host_str) + { + host_str = (char *) palloc(local_len + 1); + sprintf(host_str, "%s", buf); + } + else + { + host_str = (char *) repalloc(host_str, + strlen(host_str) + local_len + 2); + sprintf(host_str, "%s,%s", host_str, buf); + } + pfree(buf); + } + } + } + + return host_str; +} + +/* + * PGXCMirror_GetPortTotalString + * + * Called at pooler Initialization to get + * the Datanode string at Pooler initialization. 
+ * This is used to replace DataNodePorts and CoordPorts + */ +char* +PGXCMirror_GetPortTotalString(RemoteConnTypes conn_type) +{ + char *port_str = NULL; + int i, num_nodes; + + Assert(IS_PGXC_COORDINATOR && IsXCM); + Assert(conn_type == REMOTE_CONN_COORD || conn_type == REMOTE_CONN_DATANODE); + + if (conn_type == REMOTE_CONN_COORD) + num_nodes = NumCoords; + else if (conn_type == REMOTE_CONN_DATANODE) + num_nodes = NumDataNodes; + + for (i = 0; i < num_nodes; i++) + { + if (conn_type == REMOTE_CONN_COORD) + { + int local_len; + char *buf; + + buf = PGXCMirror_GetCoordPort(i + 1); + local_len = strlen(buf); + + /* OK, got it. Build the string. */ + if (!port_str) + { + port_str = (char *) palloc(local_len + 1); + sprintf(port_str, "%s", buf); + } + else + { + port_str = (char *) repalloc(port_str, + strlen(port_str) + local_len + 2); + sprintf(port_str, "%s,%s", port_str, buf); + } + pfree(buf); + } + else if (conn_type == REMOTE_CONN_DATANODE) + { + int count; + + /* Build Datanode Host string */ + Assert(PGXCNodeMirrorCount[i] > 0); + + for (count = 0; count < PGXCNodeMirrorCount[i]; count++) + { + int local_len; + char *buf; + + buf = PGXCMirror_GetDataNodePort(i + 1, count + 1); + local_len = strlen(buf); + + /* OK, got it. Build the string. 
*/ + if (!port_str) + { + port_str = (char *) palloc(local_len + 1); + sprintf(port_str, "%s", buf); + } + else + { + port_str = (char *) repalloc(port_str, + strlen(port_str) + local_len + 2); + sprintf(port_str, "%s,%s", port_str, buf); + } + pfree(buf); + } + } + } + + return port_str; +} + +/* + * PGXCMirror_GetLocalGTMHost + * + * Get GTM Host value for connection to GTM + * This is done for local node only + * This replaces GUC parameter GtmHost in Mirror mode + */ +char* +PGXCMirror_GetLocalGTMHost(void) +{ + int gtm_id = 0; + int err = 0; + + Assert(IS_PGXC_COORDINATOR || IS_PGXC_DATANODE); + + /* Get GTM Id local node is connected to */ + if (IS_PGXC_COORDINATOR) + err = find_xcm_coordinator_gtm_pxy(PGXCNodeId, &gtm_id); + else if (IS_PGXC_DATANODE) + err = find_xcm_mirror_gtm_pxy(PGXCNodeId, PGXCMirrorId, &gtm_id); + else + return NULL; + + if (err < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not get GTM information"))); + + /* Need Connection points for GTM or Proxy */ + if (gtm_id > 0) + return PGXCMirror_GetGTMProxyHost(gtm_id); + else if (gtm_id == 0) + return PGXCMirror_GetGTMHost(); + else + return NULL; +} + +/* + * PGXCMirror_GetLocalGTMPort + * + * Get GTM Port value for connection to GTM + * This is done for local node only + * This replaces GUC parameter GtmPort in Mirror mode + */ +int +PGXCMirror_GetLocalGTMPort(void) +{ + int gtm_id = 0; + int err = 0; + + Assert(IS_PGXC_COORDINATOR || IS_PGXC_DATANODE); + + /* Get GTM Id local node is connected to */ + if (IS_PGXC_COORDINATOR) + err = find_xcm_coordinator_gtm_pxy(PGXCNodeId, &gtm_id); + else if (IS_PGXC_DATANODE) + err = find_xcm_mirror_gtm_pxy(PGXCNodeId, PGXCMirrorId, &gtm_id); + else + return NULL; + + if (err < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not get GTM information"))); + + /* Need Connection points for GTM or Proxy */ + if (gtm_id > 0) + return pg_atoi(PGXCMirror_GetGTMProxyPort(gtm_id), 4, 0); 
+ else + return pg_atoi(PGXCMirror_GetGTMPort(), 4, 0); +} + +/* + * PGXCMirror_ChangeFlag + * + * Set Flag of given Node to the status wanted + */ +int +PGXCMirror_ReportFail(RemoteConnTypes conn_type, int pgxc_node_id, int mirror_id) +{ + int err = 0; + + Assert(conn_type == REMOTE_CONN_COORD || + conn_type == REMOTE_CONN_DATANODE || + conn_type == REMOTE_CONN_GTM || + conn_type == REMOTE_CONN_GTM_PROXY); + + switch(conn_type) + { + case REMOTE_CONN_COORD: + err = report_xcm_coordinator_failure(pgxc_node_id); + break; + + case REMOTE_CONN_DATANODE: + err = report_xcm_mirror_failure(pgxc_node_id, mirror_id); + break; + + case REMOTE_CONN_GTM: + err = report_xcm_gtm_failure(pgxc_node_id); + break; + + case REMOTE_CONN_GTM_PROXY: + err = report_xcm_gtm_pxy_failure(pgxc_node_id); + break; + + default: + err = -1; + } + + return err; +} + +/* Series of functions to report failures for each component type */ +void +PGXCMirror_ReportCoordFail(int pgxc_node_id) +{ + Assert(pgxc_node_id > 0 && pgxc_node_id <= NumCoords); + + if (PGXCMirror_ReportFail(REMOTE_CONN_COORD, pgxc_node_id, 0) < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not report failed Coordinator"))); +} + +void +PGXCMirror_ReportDataNodeFail(int pgxc_node_id, int mirror_id) +{ + Assert(pgxc_node_id > 0 && pgxc_node_id <= NumDataNodes); + Assert(PGXCNodeMirrorCount[pgxc_node_id - 1] >= pgxc_node_id); + + if (PGXCMirror_ReportFail(REMOTE_CONN_DATANODE, pgxc_node_id, mirror_id) < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not report failed Datanode"))); +} + +/* + * A node doesn't know if it is connected to a proxy or a gtm, + * so report the failure for the component connected to this node. 
+ */ +void +PGXCMirror_ReportGTMFail(void) +{ + int err = 0; + int gtm_id = 0; + + if (IS_PGXC_COORDINATOR) + err = find_xcm_coordinator_gtm_pxy(PGXCNodeId, &gtm_id); + else if (IS_PGXC_DATANODE) + err = find_xcm_mirror_gtm_pxy(PGXCNodeId, PGXCMirrorId, &gtm_id); + else + err = -1; + + if (err < 0) + goto report_error; + + if (gtm_id > 0) + err = PGXCMirror_ReportFail(REMOTE_CONN_GTM_PROXY, gtm_id, 0); + else if (gtm_id == 0) + err = PGXCMirror_ReportFail(REMOTE_CONN_GTM, 0, 0); + else + err = -1; + +report_error: + if (err < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not report failed GTM"))); +} + +/* + * PGXCMirror_CheckStatus + * + * Check if Coordinator or datanodes have failed or not. + * It is necessary in order not to take connections to components that crashed + * have a failed status. + */ +uint32 +PGXCMirror_CheckStatus(RemoteConnTypes conn_type, int pgxc_node_id, int mirror_id) +{ + int err = 0; + uint32 status = 0; + + Assert(conn_type == REMOTE_CONN_COORD || + conn_type == REMOTE_CONN_DATANODE); + + switch(conn_type) + { + case REMOTE_CONN_COORD: + err = get_xcm_coordinator_status(pgxc_node_id, &status); + break; + + case REMOTE_CONN_DATANODE: + err = get_xcm_mirror_status(pgxc_node_id, mirror_id, &status); + break; + + default: + err = -1; + } + + if (err < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Fault Sync ERROR: could not get Node Status"))); + + return status; +} + +/* Series of functions to check status for each component type */ +uint32 +PGXCMirror_CheckCoordStatus(int pgxc_node_id) +{ + return PGXCMirror_CheckStatus(REMOTE_CONN_COORD, pgxc_node_id, 0); +} + +uint32 +PGXCMirror_CheckDataNodeStatus(int pgxc_node_id, int mirror_id) +{ + return PGXCMirror_CheckStatus(REMOTE_CONN_DATANODE, pgxc_node_id, 0); +} + +/* + * PGXCMirror_GetNodeHost + * + * Get Node host name for given node + */ +char* +PGXCMirror_GetNodeHost(RemoteConnTypes conn_type, int pgxc_node_id, int mirror_id) +{ 
+ char *host_str = NULL; + int n_connections, conn_pt_num, local_len; + xcm_connPoint *conn_pts; + + Assert(conn_type == REMOTE_CONN_COORD || + conn_type == REMOTE_CONN_DATANODE || + conn_type == R... [truncated message content] |