diff options
| author | Pavan Deolasee | 2014-09-01 13:07:30 +0000 |
|---|---|---|
| committer | Pavan Deolasee | 2014-09-01 13:07:30 +0000 |
| commit | 8642a0b6269c9d7212a968441266ebc64e90ded8 (patch) | |
| tree | 1fdea16f55446329c613431af6c4e836a9348afb | |
| parent | fd159b3983473599768ca36ed8e4b8bfa1ed1969 (diff) | |
The Postgres-XL functionality includes MPP parallelism with
data node to data node communication, more stringent security,
and other performance enhancements. Please see release notes.
Key contributors are:
Andrei Martsinchyk
Nikhil Sontakke
Mason Sharp
271 files changed, 42562 insertions, 3745 deletions
diff --git a/.gitignore b/.gitignore index 1e15ce5fc1..689ac5bee9 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ lcov.info win32ver.rc *.exe lib*dll.def +*~ # Local excludes in root directory /GNUmakefile @@ -30,3 +31,7 @@ lib*dll.def /pgsql.sln.cache /Debug/ /Release/ +/StormDB* +/cscope* +/.gitignore + @@ -1,47 +1,381 @@ -Postgres-XC Cluster Database Management System +Postgres-XL Cluster Database Management System -Portions Copyright (c) 2010-2012, Postgres-XC Development Group -Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group +Portions Copyright (c) 2012-2014, TransLattice, Inc. +Portions Copyright (c) 2010-2013, Postgres-XC Development Group +Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group Portions Copyright (c) 1994, The Regents of the University of California -Permission to use, copy, modify, and distribute this software and its -documentation for any purpose, without fee, and without a written agreement -is hereby granted, provided that the above copyright notice and this -paragraph and the following two paragraphs appear in all copies. - -IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR -DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING -LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS -DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS -ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO -PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- -IN NO EVENT SHALL POSTGRESQL GLOBAL DEVELOPMENT GROUP BE LIABLE TO ANY -PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL -DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS -SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRESQL GLOBAL DEVELOPMENT -GROUP HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -POSTGRESQL GLOBAL DEVELOPMENT GROUP SPECIFICALLY DISCLAIMS ANY WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS -ON AN "AS IS" BASIS, AND THE POSTGRESQL GLOBAL DEVELOPMENT GROUP HAS NO OBLIGATIONS TO -PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -IN NO EVENT SHALL POSTGRES-XC DEVELOPMENT GROUP BE LIABLE TO ANY -PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL -DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS -SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRES-XC DEVELOPMENT -GROUP HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -POSTGRES-XC DEVELOPMENT GROUP SPECIFICALLY DISCLAIMS ANY WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY -AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS -ON AN "AS IS" BASIS, AND THE POSTGRES-XC DEVELOPMENT GROUP HAS NO OBLIGATIONS TO -PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. 
"Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. 
"Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000000..14e2f777f6 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. 
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. @@ -1,32 +1,30 @@ -Postgres-XC Database Management System -===================================== +Postgres-XL Database Management System +====================================== -This directory contains the source code distribution of the Postgres-XC +This directory contains the source code distribution of the Postgres-XL database management system. -Postgres-XC is an advanced object-relational cluster database management +Postgres-XL is an advanced object-relational cluster database management system that supports an extended subset of the SQL standard, including transactions, foreign keys, user-defined types and functions. This distribution also contains C language bindings. -Postgres-XC has many language interfaces similar to PostgreSQL, many of +Postgres-XL has many language interfaces similar to PostgreSQL, many of which are listed here: http://www.postgresql.org/download See the file INSTALL for instructions on how to build and install -Postgres-XC. That file also lists supported operating systems and +Postgres-XL. That file also lists supported operating systems and hardware platforms and contains information regarding any other -software packages that are required to build or run the Postgres-XC -system. Changes between all Postgres-XC releases are recorded in the +software packages that are required to build or run the Postgres-XL +system. Changes between all Postgres-XL releases are recorded in the file HISTORY. Copyright and license information can be found in the file COPYRIGHT. A comprehensive documentation set is included in this distribution; it can be read as described in the installation instructions. 
The latest version of this software may be obtained at -http://sourceforge.net/projects/postgres-xc/. For more information look at our -web site located at http://postgres-xc.sourceforge.net/. +http://sourceforge.net/projects/postgres-xl/. For more information look at our +web site located at http://postgres-xl.sourceforge.net/. -More information about Postgres-XC Development Group is available at -http://sourceforge.net/apps/mediawiki/postgres-xc/index.php?title=Charter. @@ -1,8 +1,8 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.63 for Postgres-XC 1.1devel. +# Generated by GNU Autoconf 2.63 for PostgreSQL 9.2beta2 (Postgres-XL 9.2.0). # -# Report bugs to <[email protected]>. +# Report bugs to <[email protected]>. # # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, # 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. @@ -31,6 +31,8 @@ esac fi + + # PATH needs CR # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' @@ -594,14 +596,12 @@ MAKEFLAGS= SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. 
-PACKAGE_NAME='Postgres-XC' -PACKAGE_TARNAME='postgres-xc' -# Package is based on former PostgreSQL, so base package version on that -PACKAGE_VERSION='9.2beta2' -# Postgres-XC 1.1devel is based on PostgreSQL 9.1beta2 -PACKAGE_XC_VERSION='1.1devel' -PACKAGE_STRING='Postgres-XC 1.1devel' -PACKAGE_BUGREPORT='[email protected]' +PACKAGE_NAME='PostgreSQL' +PACKAGE_TARNAME='postgresql' +PACKAGE_VERSION='9.2beta2 (Postgres-XL 9.2.0)' +PACKAGE_XC_VERSION='9.2.0' +PACKAGE_STRING='PostgreSQL 9.2beta2 (Postgres-XL 9.2.0)' +PACKAGE_BUGREPORT='[email protected]' ac_unique_file="src/backend/access/common/heaptuple.c" ac_default_prefix=/usr/local/pgsql @@ -759,7 +759,6 @@ build_vendor build_cpu build PG_MAJORVERSION -PGXC_MAJORVERSION configure_args target_alias host_alias @@ -1412,7 +1411,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures Postgres-XC 1.1devel to adapt to many kinds of systems. +\`configure' configures PostgreSQL 9.2beta2 (Postgres-XL 9.2.0) to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1477,7 +1476,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of Postgres-XC 1.1devel:";; + short | recursive ) echo "Configuration of PostgreSQL 9.2beta2 (Postgres-XL 9.2.0):";; esac cat <<\_ACEOF @@ -1561,7 +1560,7 @@ Some influential environment variables: Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. -Report bugs to <[email protected]>. +Report bugs to <[email protected]>. _ACEOF ac_status=$? 
fi @@ -1624,7 +1623,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -Postgres-XC configure 1.1devel +PostgreSQL configure 9.2beta2 (Postgres-XL 9.2.0) generated by GNU Autoconf 2.63 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -1640,7 +1639,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by Postgres-XC $as_me 1.1devel, which was +It was created by PostgreSQL $as_me 9.2beta2 (Postgres-XL 9.2.0), which was generated by GNU Autoconf 2.63. Invocation command line was $ $0 $@ @@ -2069,12 +2068,6 @@ cat >>confdefs.h <<_ACEOF #define PGXC_VERSION "$PACKAGE_XC_VERSION" _ACEOF -PGXC_MAJORVERSION=`expr "$PACKAGE_XC_VERSION" : '\([0-9][0-9]*\.[0-9][0-9]*\)'` - -cat >>confdefs.h <<_ACEOF -#define PGXC_MAJORVERSION "$PGXC_MAJORVERSION" -_ACEOF - # Make sure we can run config.sub. $SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || @@ -10249,9 +10242,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -10472,9 +10465,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## 
---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -10691,9 +10684,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -10839,9 +10832,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11004,9 +10997,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## 
---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11152,9 +11145,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11324,9 +11317,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11472,9 +11465,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## 
+## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11620,9 +11613,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11792,9 +11785,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -11940,9 +11933,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## 
_ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12088,9 +12081,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12244,9 +12237,9 @@ $as_echo "$as_me: WARNING: zlib.h: proceeding with the preprocessor's result" >& { $as_echo "$as_me:$LINENO: WARNING: zlib.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: zlib.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12400,9 +12393,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12548,9 +12541,9 @@ $as_echo "$as_me: WARNING: 
$ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12704,9 +12697,9 @@ $as_echo "$as_me: WARNING: krb5.h: proceeding with the preprocessor's result" >& { $as_echo "$as_me:$LINENO: WARNING: krb5.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: krb5.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12848,9 +12841,9 @@ $as_echo "$as_me: WARNING: openssl/ssl.h: proceeding with the preprocessor's res { $as_echo "$as_me:$LINENO: WARNING: openssl/ssl.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: openssl/ssl.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -12989,9 +12982,9 @@ $as_echo "$as_me: WARNING: openssl/err.h: proceeding with the preprocessor's res { $as_echo "$as_me:$LINENO: WARNING: 
openssl/err.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: openssl/err.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13139,9 +13132,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13287,9 +13280,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13443,9 +13436,9 @@ $as_echo "$as_me: WARNING: libxml/parser.h: proceeding with the preprocessor's r { $as_echo "$as_me:$LINENO: WARNING: libxml/parser.h: in the future, the compiler will take precedence" >&5 $as_echo 
"$as_me: WARNING: libxml/parser.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13587,9 +13580,9 @@ $as_echo "$as_me: WARNING: libxslt/xslt.h: proceeding with the preprocessor's re { $as_echo "$as_me:$LINENO: WARNING: libxslt/xslt.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: libxslt/xslt.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13738,9 +13731,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -13960,9 +13953,9 @@ $as_echo "$as_me: WARNING: dns_sd.h: proceeding with the preprocessor's result" { $as_echo "$as_me:$LINENO: WARNING: dns_sd.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: dns_sd.h: in the future, the compiler will take precedence" 
>&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -14111,9 +14104,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -14260,9 +14253,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -14423,7 +14416,7 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX ## ---------------------------------------- ## -## Report this to [email protected] ## +## Report this to [email protected] ## ## ---------------------------------------- 
## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 @@ -23221,9 +23214,9 @@ $as_echo "$as_me: WARNING: pthread.h: proceeding with the preprocessor's result" { $as_echo "$as_me:$LINENO: WARNING: pthread.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: pthread.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -28568,9 +28561,9 @@ $as_echo "$as_me: WARNING: libintl.h: proceeding with the preprocessor's result" { $as_echo "$as_me:$LINENO: WARNING: libintl.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: libintl.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -28941,9 +28934,9 @@ $as_echo "$as_me: WARNING: tcl.h: proceeding with the preprocessor's result" >&2 { $as_echo "$as_me:$LINENO: WARNING: tcl.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: tcl.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -29089,9 +29082,9 @@ $as_echo "$as_me: WARNING: 
Python.h: proceeding with the preprocessor's result" { $as_echo "$as_me:$LINENO: WARNING: Python.h: in the future, the compiler will take precedence" >&5 $as_echo "$as_me: WARNING: Python.h: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX -## ---------------------------------------- ## -## Report this to [email protected] ## -## ---------------------------------------- ## +## --------------------------------------- ## +## Report this to [email protected] ## +## --------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; @@ -29743,9 +29736,9 @@ cat >>confdefs.h <<_ACEOF #define PG_VERSION_STR "PostgreSQL $PACKAGE_VERSION on $host, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit" _ACEOF -# Supply additional version name for Postgres-XC +# Supply additional version name for Postgres-XL cat >>confdefs.h <<_ACEOF -#define PGXC_VERSION_STR "Postgres-XC $PACKAGE_XC_VERSION on $host, based on PostgreSQL $PACKAGE_VERSION, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit" +#define PGXC_VERSION_STR "Postgres-XL $PACKAGE_XC_VERSION on $host, based on PostgreSQL $PACKAGE_VERSION, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit" _ACEOF # Supply a numeric version string for use by 3rd party add-ons @@ -29758,17 +29751,10 @@ cat >>confdefs.h <<_ACEOF #define PG_VERSION_NUM $PG_VERSION_NUM _ACEOF -# Supply a numeric version string specific for Postgres-XC -PGXC_VERSION_NUM="`echo "$PACKAGE_XC_VERSION" | sed 's/[A-Za-z].*$//' | -tr '.' ' ' | -$AWK '{printf "%d%02d%02d", $1, $2, (NF >= 3) ? $3 : 0}'`" - -cat >>confdefs.h <<_ACEOF -#define PGXC_VERSION_NUM $PGXC_VERSION_NUM -_ACEOF # For PGXC, set -DPGXC by default. This can be overriden with -UPGXC if the user sets it. -CFLAGS="-DPGXC $CFLAGS" +# For Postgres-XL, set both -DPGXC and -DXCP +CFLAGS="-DPGXC -DXCP $CFLAGS" # Begin output steps @@ -30240,7 +30226,7 @@ exec 6>&1 # report actual input values of CONFIG_FILES etc. 
instead of their # values after options handling. ac_log=" -This file was extended by Postgres-XC $as_me 1.1devel, which was +This file was extended by PostgreSQL $as_me 9.2beta2 (Postgres-XL 9.2.0), which was generated by GNU Autoconf 2.63. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -30307,7 +30293,7 @@ Report bugs to <[email protected]>." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_version="\\ -Postgres-XC config.status 1.1devel +PostgreSQL config.status 9.2beta2 (Postgres-XL 9.2) configured by $0, generated by GNU Autoconf 2.63, with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" @@ -30491,7 +30477,7 @@ $debug || if test -n "$CONFIG_FILES"; then -ac_cr='
' +ac_cr='' ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null` if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then ac_cs_awk_cr='\\r' diff --git a/configure.in b/configure.in index 6769f790dc..892cd09b23 100644 --- a/configure.in +++ b/configure.in @@ -17,7 +17,7 @@ dnl Read the Autoconf manual for details. dnl m4_pattern_forbid(^PGAC_)dnl to catch undefined macros -AC_INIT([Postgres-XC], [1.1devel], [[email protected]]) +AC_INIT([PostgreSQL], [9.2beta2 (Postgres-XL 9.2)], [[email protected]]) m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.63], [], [m4_fatal([Autoconf version 2.63 is required. Untested combinations of 'autoconf' and PostgreSQL versions are not diff --git a/contrib/Makefile b/contrib/Makefile index ad449ef10c..61f61e6ecd 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -51,7 +51,8 @@ SUBDIRS = \ test_parser \ tsearch2 \ unaccent \ - vacuumlo + vacuumlo \ + stormstats ifeq ($(with_openssl),yes) SUBDIRS += sslinfo diff --git a/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part b/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part diff --git a/contrib/stormstats/Makefile b/contrib/stormstats/Makefile new file mode 100644 index 0000000000..961489a501 --- /dev/null +++ b/contrib/stormstats/Makefile @@ -0,0 +1,15 @@ +MODULE_big = stormstats +OBJS = stormstats.o + +EXTENSION = stormstats +DATA = stormstats--1.0.sql stormstats--unpackaged--1.0.sql + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = contrib/stormstats +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/stormstats/stormstats--1.0.sql b/contrib/stormstats/stormstats--1.0.sql new file mode 100644 index 0000000000..2ea2b32a6e --- /dev/null +++ b/contrib/stormstats/stormstats--1.0.sql @@ -0,0 +1,17 @@ +CREATE FUNCTION storm_database_stats( + OUT datname text, + OUT conn_cnt int8, + OUT select_cnt int8, + OUT insert_cnt int8, + OUT update_cnt int8, + OUT delete_cnt int8, + OUT ddl_cnt int8 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + +-- Register a view on the function for ease of use. +CREATE VIEW storm_database_stats AS + SELECT * FROM storm_database_stats(); + diff --git a/contrib/stormstats/stormstats--unpackaged--1.0.sql b/contrib/stormstats/stormstats--unpackaged--1.0.sql new file mode 100644 index 0000000000..df9f3a033d --- /dev/null +++ b/contrib/stormstats/stormstats--unpackaged--1.0.sql @@ -0,0 +1,5 @@ +/* contrib/stormstats/stormstats--unpackaged--1.0.sql */ + +ALTER EXTENSION stormstats ADD function storm_database_stats(); +ALTER EXTENSION stormstats ADD view storm_database_stats; + diff --git a/contrib/stormstats/stormstats.c b/contrib/stormstats/stormstats.c new file mode 100644 index 0000000000..3a32d7ede8 --- /dev/null +++ b/contrib/stormstats/stormstats.c @@ -0,0 +1,898 @@ +#include "postgres.h" + +#include <unistd.h> + +#include "catalog/pg_type.h" +#include "executor/spi.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/spin.h" +#include "access/hash.h" + +#include "tcop/utility.h" +#include "commands/dbcommands.h" +#include "utils/builtins.h" +#include "utils/syscache.h" +#include "utils/snapmgr.h" +#include "libpq/auth.h" +#include "optimizer/planner.h" +#include "nodes/makefuncs.h" +#include "funcapi.h" +#include "stormstats.h" + +#include "pgxc/pgxc.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/planner.h" +#include "pgxc/execRemote.h" + +/* mark this dynamic library to be compatible with 
PG */ +PG_MODULE_MAGIC; + +/* Location of stats file */ +#define STORM_DUMP_FILE "global/storm.stat" + +/* This constant defines the magic number in the stats file header */ +static const uint32 STORM_FILE_HEADER = 0x20120229; + +#define STORM_STATS_COLS 7 + +typedef struct ssHashKey +{ + int dbname_len; + const char *dbname_ptr; +} ssHashKey; + +typedef struct EventCounters +{ + int64 conn_cnt; + int64 select_cnt; + int64 insert_cnt; + int64 update_cnt; + int64 delete_cnt; + int64 ddl_cnt; +} EventCounters; + +typedef struct StormStatsEntry +{ + ssHashKey key; /* hash key of entry - MUST BE FIRST */ + EventCounters counters; + slock_t mutex; + char dbname[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */ + +} StormStatsEntry; + +/* Local hash table entry, no mutex needed */ +typedef struct LocalStatsEntry +{ + ssHashKey key; /* hash key of entry */ + EventCounters counters; + char dbname[NAMEDATALEN]; +} LocalStatsEntry; + +typedef struct StormSharedState +{ + LWLockId lock; +} StormSharedState; + +static bool sp_save; /* whether to save stats across shutdown */ + +extern PlannedStmt *planner_callback(Query *parse, int cursorOptions, ParamListInfo boundParams); +extern void auth_check(Port *port, int status); + +static void sp_shmem_startup(void); +static void sp_shmem_shutdown(int code, Datum arg); +static Size hash_memsize(void); + +static uint32 ss_hash_fn(const void *key, Size keysize); +static int ss_match_fn(const void *key1, const void *key2, Size keysize); +static void stats_store(const char *dbname, CmdType c, bool isConnEvent, bool isUtilEvent); + +static StormStatsEntry *alloc_event_entry(ssHashKey *key); + +/* Functions */ +Datum storm_database_stats(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(storm_database_stats); + +/* Shared Memory Objects */ +static HTAB *StatsEntryHash = NULL; +static StormSharedState *shared_state = NULL; + +/* Session level objects */ +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + +static 
ClientAuthentication_hook_type original_client_auth_hook = NULL; + +static ProcessUtility_hook_type prev_ProcessUtility = NULL; + +static int max_tracked_dbs; + +static void +ProcessUtility_callback(Node *parsetree, + const char *queryString, + ParamListInfo params, + bool isTopLevel, + DestReceiver *dest, +#ifdef PGXC + bool sentToRemote, +#endif /* PGXC */ + char *completionTag) +{ + elog( DEBUG1, "STORMSTATS: using plugin." ); + + standard_ProcessUtility(parsetree, queryString, params, isTopLevel, dest, +#ifdef PGXC + sentToRemote, +#endif /* PGXC */ + completionTag); + + stats_store(get_database_name(MyDatabaseId), CMD_UNKNOWN, false, true); + + /* + * Check if it's a CREATE/DROP DATABASE command. Update entries in the + * shared hash table accordingly. + */ + switch (nodeTag(parsetree)) + { + case T_CreatedbStmt: + { + ssHashKey key; + StormStatsEntry *entry; + CreatedbStmt *stmt = (CreatedbStmt *)parsetree; + + /* Set up key for hashtable search */ + key.dbname_len = strlen(stmt->dbname); + key.dbname_ptr = stmt->dbname; + + /* + * Lookup the hash table entry with exclusive lock. We have to + * manipulate the entries immediately anyways.. + */ + LWLockAcquire(shared_state->lock, LW_EXCLUSIVE); + + entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_FIND, NULL); + + /* What do we do if we find an entry already? We WARN for now */ + if (!entry) + entry = alloc_event_entry(&key); + else + ereport(WARNING, + (errmsg("entry exists already for database %s!", + entry->dbname))); + LWLockRelease(shared_state->lock); + break; + } + case T_DropdbStmt: + { + ssHashKey key; + StormStatsEntry *entry; + DropdbStmt *stmt = (DropdbStmt *)parsetree; + + /* Set up key for hashtable search */ + key.dbname_len = strlen(stmt->dbname); + key.dbname_ptr = stmt->dbname; + + /* + * Lookup the hash table entry with exclusive lock. We have to + * manipulate the entries immediately anyways.. 
+ */ + LWLockAcquire(shared_state->lock, LW_EXCLUSIVE); + + entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_REMOVE, NULL); + + /* What do we do if we do not find an entry? We WARN for now */ + if (!entry && !stmt->missing_ok) + ereport(WARNING, + (errmsg("entry does not exist for database %s!", + stmt->dbname))); + LWLockRelease(shared_state->lock); + break; + } + default: + /* Nothing */; + } +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + DefineCustomIntVariable("storm_stats.max_tracked_databases", + "Sets the maximum number of databases tracked.", + NULL, + &max_tracked_dbs, + 1000, + 1, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable("storm_stats.save", + "Save statistics across server shutdowns.", + NULL, + &sp_save, + true, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + + EmitWarningsOnPlaceholders("storm_stats"); + + RequestAddinShmemSpace(hash_memsize()); + RequestAddinLWLocks(1); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = sp_shmem_startup; + planner_hook = planner_callback; + + original_client_auth_hook = ClientAuthentication_hook; + ClientAuthentication_hook = auth_check; + + prev_ProcessUtility = ProcessUtility_hook; + ProcessUtility_hook = ProcessUtility_callback; + + elog( DEBUG1, "STORMSTATS: plugin loaded" ); +} + +void +_PG_fini(void) +{ + shmem_startup_hook = prev_shmem_startup_hook; + planner_hook = NULL; + ProcessUtility_hook = prev_ProcessUtility; + + elog( DEBUG1, "STORMSTATS: plugin unloaded." 
); +} + +static void sp_shmem_startup(void) +{ + HASHCTL event_ctl; + bool found; + FILE *file; + uint32 header; + int32 num; + int32 i; + int buffer_size; + char *buffer = NULL; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* + * Create or attach to the shared memory state, including hash table + */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + shared_state = ShmemInitStruct("storm_stats state", sizeof(StormSharedState), &found); + if (!shared_state) + elog(ERROR, "out of shared memory"); + + if (!found) + { + shared_state->lock = LWLockAssign(); + } + + memset(&event_ctl, 0, sizeof(event_ctl)); + + event_ctl.keysize = sizeof(ssHashKey); + event_ctl.entrysize = sizeof(StormStatsEntry) + NAMEDATALEN; + event_ctl.hash = ss_hash_fn; + event_ctl.match = ss_match_fn; + + StatsEntryHash = ShmemInitHash("storm_stats event hash", max_tracked_dbs, + max_tracked_dbs, &event_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); + if (!StatsEntryHash) + elog(ERROR, "out of shared memory"); + + LWLockRelease(AddinShmemInitLock); + + /* + * If we're in the postmaster (or a standalone backend...), set up a shmem + * exit hook to dump the statistics to disk. + */ + if (!IsUnderPostmaster) + on_shmem_exit(sp_shmem_shutdown, (Datum) 0); + + /* + * Attempt to load old statistics from the dump file, if this is the first + * time through and we weren't told not to. + */ + if (found || !sp_save) + return; + + /* + * Note: we don't bother with locks here, because there should be no other + * processes running when this code is reached. 
+ */ + file = AllocateFile(STORM_DUMP_FILE, PG_BINARY_R); + if (file == NULL) + { + if (errno == ENOENT) + return; /* ignore not-found error */ + goto error; + } + + buffer_size = NAMEDATALEN; + buffer = (char *) palloc(buffer_size); + + if (fread(&header, sizeof(uint32), 1, file) != 1 || + header != STORM_FILE_HEADER || + fread(&num, sizeof(int32), 1, file) != 1) + goto error; + + for (i = 0; i < num; i++) + { + StormStatsEntry temp; + StormStatsEntry *entry; + + if (fread(&temp, offsetof(StormStatsEntry, mutex), 1, file) != 1) + goto error; + + if (temp.key.dbname_len >= buffer_size) + { + buffer = (char *) repalloc(buffer, temp.key.dbname_len + 1); + buffer_size = temp.key.dbname_len + 1; + } + + if (fread(buffer, 1, temp.key.dbname_len, file) != temp.key.dbname_len) + goto error; + buffer[temp.key.dbname_len] = '\0'; + + temp.key.dbname_ptr = buffer; + + /* make the hashtable entry (discards old entries if too many) */ + entry = alloc_event_entry(&temp.key); + + /* copy in the actual stats */ + entry->counters = temp.counters; + } + + pfree(buffer); + FreeFile(file); + return; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read stormstats file \"%s\": %m", + STORM_DUMP_FILE))); + if (buffer) + pfree(buffer); + if (file) + FreeFile(file); + /* If possible, throw away the bogus file; ignore any error */ + unlink(STORM_DUMP_FILE); +} + +/* + * shmem_shutdown hook: Dump statistics into file. + * + * Note: we don't bother with acquiring lock, because there should be no + * other processes running when this is called. + */ +static void +sp_shmem_shutdown(int code, Datum arg) +{ + FILE *file; + HASH_SEQ_STATUS hash_seq; + int32 num_entries; + StormStatsEntry *entry; + + /* Don't try to dump during a crash. */ + if (code) + return; + + /* Safety check ... shouldn't get here unless shmem is set up. */ + if (!shared_state || !StatsEntryHash) + return; + + /* Don't dump if told not to. 
*/ + if (!sp_save) + return; + + file = AllocateFile(STORM_DUMP_FILE, PG_BINARY_W); + if (file == NULL) + goto error; + + if (fwrite(&STORM_FILE_HEADER, sizeof(uint32), 1, file) != 1) + goto error; + num_entries = hash_get_num_entries(StatsEntryHash); + if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) + goto error; + + hash_seq_init(&hash_seq, StatsEntryHash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + int len = entry->key.dbname_len; + + if (fwrite(entry, offsetof(StormStatsEntry, mutex), 1, file) != 1 || + fwrite(entry->dbname, 1, len, file) != len) + goto error; + } + + if (FreeFile(file)) + { + file = NULL; + goto error; + } + + return; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write stormstats file \"%s\": %m", + STORM_DUMP_FILE))); + + if (file) + FreeFile(file); + unlink(STORM_DUMP_FILE); +} + +PlannedStmt *planner_callback(Query *parse, int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *plan; + + elog( DEBUG1, "STORMSTATS: using plugin." ); + + /* Generate a plan */ + plan = standard_planner(parse, cursorOptions, boundParams); + + stats_store(get_database_name(MyDatabaseId), parse->commandType, false, false); + + return plan; +} + +void auth_check(Port *port, int status) +{ + elog( DEBUG1, "STORMSTATS: using plugin." ); + + /* + * Any other plugins which use ClientAuthentication_hook. 
+ */ + if (original_client_auth_hook) + original_client_auth_hook(port, status); + + if (status == STATUS_OK) + { + stats_store(port->database_name, CMD_UNKNOWN, true, false); + } +} + +static Size hash_memsize(void) +{ + Size size; + Size events_size; + Size state_size; + + events_size = hash_estimate_size(max_tracked_dbs, MAXALIGN(sizeof(StormStatsEntry))); + state_size = MAXALIGN(sizeof(StormSharedState)); + + size = add_size(events_size, state_size); + + return size; +} + +static StormStatsEntry *alloc_event_entry(ssHashKey *key) +{ + StormStatsEntry *entry; + bool found; + + if (hash_get_num_entries(StatsEntryHash) >= max_tracked_dbs) + { + elog(ERROR, "STORMSTATS: The maximum number of tracked databases have been reached"); + return NULL; + } + + /* Find or create an entry with desired hash code */ + entry = (StormStatsEntry *) hash_search(StatsEntryHash, key, HASH_ENTER, &found); + + if (!found) + { + entry->key.dbname_ptr = entry->dbname; + memset(&entry->counters, 0, sizeof(EventCounters)); + SpinLockInit(&entry->mutex); + + memcpy(entry->dbname, key->dbname_ptr, key->dbname_len); + entry->dbname[key->dbname_len] = '\0'; + } + + return entry; +} + +/* + * Calculate hash value for a key + */ +static uint32 +ss_hash_fn(const void *key, Size keysize) +{ + const ssHashKey *k = (const ssHashKey *) key; + + /* we don't bother to include encoding in the hash */ + return DatumGetUInt32(hash_any((const unsigned char *) k->dbname_ptr, + k->dbname_len)); +} + +/* + * Compare two keys - zero means match + */ +static int +ss_match_fn(const void *key1, const void *key2, Size keysize) +{ + const ssHashKey *k1 = (const ssHashKey *) key1; + const ssHashKey *k2 = (const ssHashKey *) key2; + + if (k1->dbname_len == k2->dbname_len && + memcmp(k1->dbname_ptr, k2->dbname_ptr, k1->dbname_len) == 0) + return 0; + else + return 1; +} + +static void +stats_store(const char *dbname, CmdType c, bool isConnEvent, bool isUtilEvent) +{ + ssHashKey key; + StormStatsEntry *entry; + + if 
(!shared_state || !StatsEntryHash) + return; + + /* Set up key for hashtable search */ + key.dbname_len = strlen(dbname); + key.dbname_ptr = dbname; + + /* Lookup the hash table entry with shared lock. */ + LWLockAcquire(shared_state->lock, LW_SHARED); + + entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_FIND, NULL); + if (!entry) + { + /* Must acquire exclusive lock to add a new entry. */ + LWLockRelease(shared_state->lock); + LWLockAcquire(shared_state->lock, LW_EXCLUSIVE); + entry = alloc_event_entry(&key); + } + + /* Grab the spinlock while updating the counters. */ + { + volatile StormStatsEntry *e = (volatile StormStatsEntry *) entry; + + SpinLockAcquire(&e->mutex); + + if (isConnEvent) { + e->counters.conn_cnt += 1; + } else if (isUtilEvent) { + e->counters.ddl_cnt += 1; + } else { + switch (c) + { + case CMD_SELECT: + e->counters.select_cnt += 1; + break; + case CMD_INSERT: + e->counters.insert_cnt += 1; + break; + case CMD_UPDATE: + e->counters.update_cnt += 1; + break; + case CMD_DELETE: + e->counters.delete_cnt += 1; + break; + case CMD_UTILITY: + case CMD_UNKNOWN: + case CMD_NOTHING: + break; + } + } + SpinLockRelease(&e->mutex); + } + + LWLockRelease(shared_state->lock); +} + +/* + * Gather statistics from remote coordinators + */ +static HTAB * +storm_gather_remote_coord_info(Oid funcid) +{ + bool found; + EState *estate; + TupleTableSlot *result; + RemoteQuery *step; + RemoteQueryState *node; + int i, ncolumns; + HeapTuple tp; + TupleDesc tupdesc; + MemoryContext oldcontext; + HTAB *LocalStatsHash; + HASHCTL event_ctl; + + /* + * We will sort output by database name, should make adding up info from + * multiple remote coordinators easier + */ + char *query = "SELECT * FROM storm_database_stats() ORDER BY datname"; + + /* Build up RemoteQuery */ + step = makeNode(RemoteQuery); + + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = NULL; + step->sql_statement = query; + step->force_autocommit = false; + step->read_only = 
true; + step->exec_type = EXEC_ON_COORDS; + + /* Build a local hash table to contain info from remote nodes */ + memset(&event_ctl, 0, sizeof(event_ctl)); + + event_ctl.keysize = sizeof(ssHashKey); + event_ctl.entrysize = sizeof(LocalStatsEntry); + event_ctl.hash = ss_hash_fn; + event_ctl.match = ss_match_fn; + + LocalStatsHash = hash_create("storm_stats local hash", max_tracked_dbs, + &event_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); + if (!LocalStatsHash) + elog(ERROR, "out of memory"); + + /* + * Add targetlist entries. We use the proc oid to get the tupledesc for + * this. We could have hardcoded the types of existing set of columns, but + * if we change the columns later for whatever reasons, this keeps us sane + */ + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + + /* Build a tupdesc of all the OUT parameters */ + tupdesc = build_function_result_tupdesc_t(tp); + ncolumns = tupdesc->natts; + + for (i = 0; i < ncolumns; ++i) + { + Var *var; + TargetEntry *tle; + + var = makeVar(1, + tupdesc->attrs[i]->attnum, + tupdesc->attrs[i]->atttypid, + tupdesc->attrs[i]->atttypmod, + InvalidOid, + 0); + + tle = makeTargetEntry((Expr *) var, tupdesc->attrs[i]->attnum, NULL, false); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, tle); + } + ReleaseSysCache(tp); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + estate->es_snapshot = GetActiveSnapshot(); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + /* get ready to combine results */ + result = ExecRemoteQuery(node); + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + ssHashKey key; + LocalStatsEntry *entry; + char *dbname; + + /* Process statistics from the coordinator nodes */ + value = slot_getattr(result, 1, &isnull); /* datname */ + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + 
errmsg("database name must not be null"))); + + dbname = TextDatumGetCString(value); + + /* Set up key for hashtable search */ + key.dbname_len = strlen(dbname); + key.dbname_ptr = dbname; + + /* Find or create an entry with desired hash code */ + entry = (LocalStatsEntry *) hash_search(LocalStatsHash, &key, HASH_ENTER, &found); + if (!found) + { + entry->key.dbname_ptr = entry->dbname; + memset(&entry->counters, 0, sizeof(EventCounters)); + memcpy(entry->dbname, key.dbname_ptr, key.dbname_len); + entry->dbname[key.dbname_len] = '\0'; + } + + value = slot_getattr(result, 2, &isnull); /* conn_cnt */ + if (!isnull) + entry->counters.conn_cnt += DatumGetInt64(value); + + value = slot_getattr(result, 3, &isnull); /* select_cnt */ + if (!isnull) + entry->counters.select_cnt += DatumGetInt64(value); + + value = slot_getattr(result, 4, &isnull); /* insert_cnt */ + if (!isnull) + entry->counters.insert_cnt += DatumGetInt64(value); + + value = slot_getattr(result, 5, &isnull); /* update_cnt */ + if (!isnull) + entry->counters.update_cnt += DatumGetInt64(value); + + value = slot_getattr(result, 6, &isnull); /* delete_cnt */ + if (!isnull) + entry->counters.delete_cnt += DatumGetInt64(value); + + value = slot_getattr(result, 7, &isnull); /* ddl_cnt */ + if (!isnull) + entry->counters.ddl_cnt += DatumGetInt64(value); + + /* fetch next */ + result = ExecRemoteQuery(node); + } + ExecEndRemoteQuery(node); + + return LocalStatsHash; +} + +Datum storm_database_stats(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + HASH_SEQ_STATUS hash_seq; + StormStatsEntry *entry; + HTAB *LocalStatsHash = NULL; + + if (IS_PGXC_DATANODE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("invalid invocation on data node"))); + + if (!shared_state || !StatsEntryHash) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), 
+ errmsg("storm_stats must be loaded via shared_preload_libraries"))); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* + * Query the rest of the coordinators and get their stats. Do this only if + * you are query originator. Otherwise just provide your local info and + * return + */ + if (IsConnFromApp()) + LocalStatsHash = storm_gather_remote_coord_info(fcinfo->flinfo->fn_oid); + + tupdesc = CreateTemplateTupleDesc(STORM_STATS_COLS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "dbname", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "conn_cnt", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "select_cnt", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "insert_cnt", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "update_cnt", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "delete_cnt", INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "ddl_cnt", INT8OID, -1, 0); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + LWLockAcquire(shared_state->lock, LW_SHARED); + + hash_seq_init(&hash_seq, StatsEntryHash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + Datum values[STORM_STATS_COLS]; + bool nulls[STORM_STATS_COLS]; + int i = 0; + EventCounters tmp, lcl; + + /* generate junk in short-term context */ + 
MemoryContextSwitchTo(oldcontext); + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + memset(&lcl, 0, sizeof(lcl)); + + values[i++] = CStringGetTextDatum(entry->dbname); + + /* copy counters to a local variable to keep locking time short */ + { + volatile StormStatsEntry *e = (volatile StormStatsEntry *) entry; + + SpinLockAcquire(&e->mutex); + tmp = e->counters; + SpinLockRelease(&e->mutex); + } + + /* See if LocalStatsHash has additional info to provide */ + if (LocalStatsHash) + { + ssHashKey key; + LocalStatsEntry *le; + bool found; + + /* Set up key for hashtable search */ + key.dbname_len = strlen(entry->dbname); + key.dbname_ptr = entry->dbname; + + /* Find an entry with desired hash code */ + le = (LocalStatsEntry *) hash_search(LocalStatsHash, &key, HASH_FIND, &found); + + /* + * What should we do if entry is not found on the other + * coordinators? WARN for now.. + */ + if (!found) + { + ereport(WARNING, + (errmsg("no stats collected from remote coordinators for database %s!", + entry->dbname))); + } + else + { + tmp.ddl_cnt += le->counters.ddl_cnt; + tmp.conn_cnt += le->counters.conn_cnt; + tmp.select_cnt += le->counters.select_cnt; + tmp.insert_cnt += le->counters.insert_cnt; + tmp.update_cnt += le->counters.update_cnt; + tmp.delete_cnt += le->counters.delete_cnt; + } + } + + values[i++] = Int64GetDatumFast(tmp.conn_cnt); + values[i++] = Int64GetDatumFast(tmp.select_cnt); + values[i++] = Int64GetDatumFast(tmp.insert_cnt); + values[i++] = Int64GetDatumFast(tmp.update_cnt); + values[i++] = Int64GetDatumFast(tmp.delete_cnt); + values[i++] = Int64GetDatumFast(tmp.ddl_cnt); + + Assert(i == STORM_STATS_COLS); + + /* switch to appropriate context while storing the tuple */ + MemoryContextSwitchTo(per_query_ctx); + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + LWLockRelease(shared_state->lock); + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + /* destroy local hash table */ + if 
(LocalStatsHash) + hash_destroy(LocalStatsHash); + + MemoryContextSwitchTo(oldcontext); + + return (Datum) 0; +} diff --git a/contrib/stormstats/stormstats.control b/contrib/stormstats/stormstats.control new file mode 100644 index 0000000000..b7816feef9 --- /dev/null +++ b/contrib/stormstats/stormstats.control @@ -0,0 +1,5 @@ +# stormstats extension +comment = 'collect deeper database stats for StormDB' +default_version = '1.0' +module_pathname = '$libdir/stormstats' +relocatable = true diff --git a/contrib/stormstats/stormstats.h b/contrib/stormstats/stormstats.h new file mode 100644 index 0000000000..c11846d0a5 --- /dev/null +++ b/contrib/stormstats/stormstats.h @@ -0,0 +1,9 @@ +#ifndef STORMSTATS_H +#define STORMSTATS_H + +#include "postgres.h" + +extern void _PG_init(void); +extern void _PG_fini(void); + +#endif /* STORMSTATS_H */ diff --git a/src/backend/Makefile b/src/backend/Makefile index 828c084ceb..611d29fa87 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -50,8 +50,8 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \ $(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \ $(top_builddir)/src/port/libpgport_srv.a \ $(top_builddir)/src/gtm/client/libgtmclient.a \ - $(top_builddir)/src/gtm/common/libgtmcommon.a \ - $(top_builddir)/src/interfaces/libpq/libpq.a + $(top_builddir)/src/gtm/common/libgtm.a \ + $(top_builddir)/src/gtm/libpq/libpqcomm.a # We put libpgport into OBJS, so remove it from LIBS; also add libldap LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE) @@ -147,15 +147,6 @@ catalog/schemapg.h: | submake-schemapg $(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport -# Those are rules to create dependent GTM libraries automatically -$(top_builddir)/src/interfaces/libpq/libpq.a: - $(MAKE) -C $(top_builddir)/src/interfaces/libpq libpq.a - -$(top_builddir)/src/gtm/common/libgtmcommon.a: - $(MAKE) -C $(top_builddir)/src/gtm/common libgtmcommon.a - -$(top_builddir)/src/gtm/client/libgtmclient.a: - $(MAKE) -C 
$(top_builddir)/src/gtm/client libgtmclient.a # The postgres.o target is needed by the rule in Makefile.global that # creates the exports file when MAKE_EXPORTS = true. diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 998fc5d58d..62479c04bb 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -45,6 +45,11 @@ * and we'd like to still refer to them via C struct offsets. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -63,7 +68,9 @@ #include "access/sysattr.h" #include "access/tuptoaster.h" #include "executor/tuptable.h" - +#ifdef XCP +#include "utils/memutils.h" +#endif /* Does att's datatype allow packing into the 1-byte-header varlena format? 
*/ #define ATT_IS_PACKABLE(att) \ @@ -1154,14 +1161,23 @@ slot_deform_datarow(TupleTableSlot *slot) int attnum; int i; int col_count; +#ifdef XCP + char *cur = slot->tts_datarow->msg; +#else char *cur = slot->tts_dataRow; +#endif StringInfo buffer; uint16 n16; uint32 n32; MemoryContext oldcontext; +#ifdef XCP + if (slot->tts_tupleDescriptor == NULL || slot->tts_datarow == NULL) + return; +#else if (slot->tts_tupleDescriptor == NULL || slot->tts_dataRow == NULL) return; +#endif attnum = slot->tts_tupleDescriptor->natts; @@ -1169,7 +1185,10 @@ slot_deform_datarow(TupleTableSlot *slot) if (slot->tts_nvalid == attnum) return; +#ifndef XCP + /* XCP: Can not happen, we return earlier if condition not true */ Assert(slot->tts_dataRow); +#endif memcpy(&n16, cur, 2); cur += 2; @@ -1180,6 +1199,30 @@ slot_deform_datarow(TupleTableSlot *slot) (errcode(ERRCODE_DATA_CORRUPTED), errmsg("Tuple does not match the descriptor"))); +#ifdef XCP + if (slot->tts_attinmeta == NULL) + { + /* + * Ensure info about input functions is available as long as slot lives + */ + oldcontext = MemoryContextSwitchTo(slot->tts_mcxt); + slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor); + MemoryContextSwitchTo(oldcontext); + } + + /* + * Store values to separate context to easily free them when base datarow is + * freed + */ + if (slot->tts_drowcxt == NULL) + { + slot->tts_drowcxt = AllocSetContextCreate(slot->tts_mcxt, + "Datarow", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + } +#else /* * Ensure info about input functions is available as long as slot lives * as well as deformed values @@ -1188,10 +1231,12 @@ slot_deform_datarow(TupleTableSlot *slot) if (slot->tts_attinmeta == NULL) slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor); +#endif buffer = makeStringInfo(); for (i = 0; i < attnum; i++) { + Form_pg_attribute attr = slot->tts_tupleDescriptor->attrs[i]; int len; /* get size */ @@ -1217,6 +1262,48 @@ 
slot_deform_datarow(TupleTableSlot *slot) slot->tts_isnull[i] = false; resetStringInfo(buffer); + +#ifdef XCP + /* + * The input function was executed in caller's memory context, + * because it may be allocating working memory, and caller may + * want to clean it up. + * However returned Datums need to be in the special context, so + * if attribute is pass-by-reference, copy it. + */ + if (!attr->attbyval) + { + Pointer val = DatumGetPointer(slot->tts_values[i]); + Size data_length; + void *data; + + if (attr->attlen == -1) + { + /* varlena */ + if (VARATT_IS_EXTERNAL(val)) + /* no alignment, since it's short by definition */ + data_length = VARSIZE_EXTERNAL(val); + else if (VARATT_IS_SHORT(val)) + /* no alignment for short varlenas */ + data_length = VARSIZE_SHORT(val); + else + data_length = VARSIZE(val); + } + else if (attr->attlen == -2) + { + /* cstring */ + data_length = strlen(val) + 1; + } + else + { + /* fixed-length pass-by-reference */ + data_length = attr->attlen; + } + data = MemoryContextAlloc(slot->tts_drowcxt, data_length); + memcpy(data, val, data_length); + slot->tts_values[i] = (Datum) data; + } +#endif } } pfree(buffer->data); @@ -1224,7 +1311,9 @@ slot_deform_datarow(TupleTableSlot *slot) slot->tts_nvalid = attnum; +#ifndef XCP MemoryContextSwitchTo(oldcontext); +#endif } #endif @@ -1279,7 +1368,11 @@ slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull) #ifdef PGXC /* If it is a data row tuple extract all and return requested */ +#ifdef XCP + if (slot->tts_datarow) +#else if (slot->tts_dataRow) +#endif { slot_deform_datarow(slot); *isnull = slot->tts_isnull[attnum - 1]; @@ -1359,7 +1452,11 @@ slot_getallattrs(TupleTableSlot *slot) #ifdef PGXC /* Handle the DataRow tuple case */ +#ifdef XCP + if (slot->tts_datarow) +#else if (slot->tts_dataRow) +#endif { slot_deform_datarow(slot); return; @@ -1411,7 +1508,11 @@ slot_getsomeattrs(TupleTableSlot *slot, int attnum) #ifdef PGXC /* Handle the DataRow tuple case */ +#ifdef XCP + if 
(slot->tts_datarow) +#else if (slot->tts_dataRow) +#endif { slot_deform_datarow(slot); return; @@ -1487,7 +1588,11 @@ slot_attisnull(TupleTableSlot *slot, int attnum) #ifdef PGXC /* If it is a data row tuple extract all and return requested */ +#ifdef XCP + if (slot->tts_datarow) +#else if (slot->tts_dataRow) +#endif { slot_deform_datarow(slot); return slot->tts_isnull[attnum - 1]; diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index a1a49f987d..10a45a3146 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -5,6 +5,11 @@ * clients and standalone backends are supported here). * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -314,12 +319,20 @@ printtup(TupleTableSlot *slot, DestReceiver *self) * values, just send over the DataRow message as we received it from the * Datanode */ +#ifdef XCP + if (slot->tts_datarow) + { + pq_putmessage('D', slot->tts_datarow->msg, slot->tts_datarow->msglen); + return; + } +#else if (slot->tts_dataRow) { pq_putmessage('D', slot->tts_dataRow, slot->tts_dataLen); return; } #endif +#endif /* Set or update my derived attribute info, if needed */ if (myState->attrinfo != typeinfo || myState->nattrs != natts) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 882fa6776b..d0dd340a71 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -23,6 +23,11 @@ * for aborts (whether sync or async), since the post-crash assumption would * be that such transactions failed anyway. 
* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -353,10 +358,21 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i * Current state change should be from 0 or subcommitted to target state * or we should already be there when replaying changes during recovery. */ +#ifdef XCP + if (!(curval == 0 || + (curval == TRANSACTION_STATUS_SUB_COMMITTED && + status != TRANSACTION_STATUS_IN_PROGRESS) || + curval == status)) + { + elog(WARNING, "Unexpected clog condition. curval = %d, status = %d", + curval, status); + } +#else Assert(curval == 0 || (curval == TRANSACTION_STATUS_SUB_COMMITTED && status != TRANSACTION_STATUS_IN_PROGRESS) || curval == status); +#endif /* note this assumes exclusive access to the clog page */ byteval = *byteptr; diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index b425ce434d..bcfc931b17 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -18,10 +18,24 @@ #include "utils/elog.h" #include "miscadmin.h" #include "pgxc/pgxc.h" - +#ifdef XCP +#include "postgres.h" +#include "gtm/gtm_c.h" +#include "postmaster/autovacuum.h" +#include "storage/backendid.h" +#include "utils/lsyscache.h" + +/* To access sequences */ +#define MyCoordName \ + OidIsValid(MyCoordId) ? 
get_pgxc_nodename(MyCoordId) : "" +#endif /* Configuration variables */ char *GtmHost = "localhost"; int GtmPort = 6666; +#ifdef XCP +bool IsXidFromGTM = false; +#endif + extern bool FirstSnapshotSet; static GTM_Conn *conn; @@ -97,6 +111,11 @@ InitGTM(void) CloseGTM(); } + +#ifdef XCP + else if (IS_PGXC_COORDINATOR) + register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); +#endif } void @@ -136,6 +155,10 @@ BeginTranGTM(GTM_Timestamp *timestamp) if (conn) xid = begin_transaction(conn, GTM_ISOLATION_RC, timestamp); } +#ifdef XCP + if (xid) + IsXidFromGTM = true; +#endif currentGxid = xid; return xid; } @@ -173,6 +196,10 @@ CommitTranGTM(GlobalTransactionId gxid) if (!GlobalTransactionIdIsValid(gxid)) return 0; CheckConnection(); +#ifdef XCP + ret = -1; + if (conn) +#endif ret = commit_transaction(conn, gxid); /* @@ -184,6 +211,10 @@ CommitTranGTM(GlobalTransactionId gxid) { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = commit_transaction(conn, gxid); +#endif } /* Close connection in case commit is done by autovacuum worker or launcher */ @@ -206,6 +237,10 @@ CommitPreparedTranGTM(GlobalTransactionId gxid, GlobalTransactionId prepared_gxi if (!GlobalTransactionIdIsValid(gxid) || !GlobalTransactionIdIsValid(prepared_gxid)) return ret; CheckConnection(); +#ifdef XCP + ret = -1; + if (conn) +#endif ret = commit_prepared_transaction(conn, gxid, prepared_gxid); /* @@ -218,6 +253,10 @@ CommitPreparedTranGTM(GlobalTransactionId gxid, GlobalTransactionId prepared_gxi { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = commit_prepared_transaction(conn, gxid, prepared_gxid); +#endif } currentGxid = InvalidGlobalTransactionId; return ret; @@ -244,6 +283,10 @@ RollbackTranGTM(GlobalTransactionId gxid) { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = abort_transaction(conn, gxid); +#endif } currentGxid = InvalidGlobalTransactionId; @@ -261,6 +304,10 @@ StartPreparedTranGTM(GlobalTransactionId gxid, return 0; CheckConnection(); +#ifdef XCP + ret = -1; 
+ if (conn) +#endif ret = start_prepared_transaction(conn, gxid, gid, nodestring); /* @@ -272,6 +319,10 @@ StartPreparedTranGTM(GlobalTransactionId gxid, { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = start_prepared_transaction(conn, gxid, gid, nodestring); +#endif } return ret; @@ -285,6 +336,10 @@ PrepareTranGTM(GlobalTransactionId gxid) if (!GlobalTransactionIdIsValid(gxid)) return 0; CheckConnection(); +#ifdef XCP + ret = -1; + if (conn) +#endif ret = prepare_transaction(conn, gxid); /* @@ -296,6 +351,10 @@ PrepareTranGTM(GlobalTransactionId gxid) { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = prepare_transaction(conn, gxid); +#endif } currentGxid = InvalidGlobalTransactionId; return ret; @@ -311,6 +370,10 @@ GetGIDDataGTM(char *gid, int ret = 0; CheckConnection(); +#ifdef XCP + ret = -1; + if (conn) +#endif ret = get_gid_data(conn, GTM_ISOLATION_RC, gid, gxid, prepared_gxid, nodestring); @@ -323,6 +386,11 @@ GetGIDDataGTM(char *gid, { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret = get_gid_data(conn, GTM_ISOLATION_RC, gid, gxid, + prepared_gxid, nodestring); +#endif } return ret; @@ -339,6 +407,10 @@ GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped) { CloseGTM(); InitGTM(); +#ifdef XCP + if (conn) + ret_snapshot = get_snapshot(conn, gxid, canbe_grouped); +#endif } return ret_snapshot; } @@ -374,26 +446,105 @@ AlterSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval, return conn ? alter_sequence(conn, &seqkey, increment, minval, maxval, startval, lastval, cycle, is_restart) : 0; } +/* + * get the current sequence value + */ + +GTM_Sequence +GetCurrentValGTM(char *seqname) +{ + GTM_Sequence ret = -1; + GTM_SequenceKeyData seqkey; +#ifdef XCP + char *coordName = IS_PGXC_COORDINATOR ? PGXCNodeName : MyCoordName; + int coordPid = IS_PGXC_COORDINATOR ? 
MyProcPid : MyCoordPid; + int status; +#endif + CheckConnection(); + seqkey.gsk_keylen = strlen(seqname) + 1; + seqkey.gsk_key = seqname; + +#ifdef XCP + if (conn) + status = get_current(conn, &seqkey, coordName, coordPid, &ret); + else + status = GTM_RESULT_COMM_ERROR; + + /* retry once */ + if (status == GTM_RESULT_COMM_ERROR) + { + CloseGTM(); + InitGTM(); + if (conn) + status = get_current(conn, &seqkey, coordName, coordPid, &ret); + } + if (status != GTM_RESULT_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s", GTMPQerrorMessage(conn)))); +#else + if (conn) + ret = get_current(conn, &seqkey); + + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } +#endif + return ret; +} /* * Get the next sequence value */ GTM_Sequence +#ifdef XCP +GetNextValGTM(char *seqname, GTM_Sequence range, GTM_Sequence *rangemax) +#else GetNextValGTM(char *seqname) +#endif { GTM_Sequence ret = -1; GTM_SequenceKeyData seqkey; +#ifdef XCP + char *coordName = IS_PGXC_COORDINATOR ? PGXCNodeName : MyCoordName; + int coordPid = IS_PGXC_COORDINATOR ? MyProcPid : MyCoordPid; + int status; +#endif CheckConnection(); seqkey.gsk_keylen = strlen(seqname) + 1; seqkey.gsk_key = seqname; +#ifdef XCP + if (conn) + status = get_next(conn, &seqkey, coordName, + coordPid, range, &ret, rangemax); + else + status = GTM_RESULT_COMM_ERROR; + + /* retry once */ + if (status == GTM_RESULT_COMM_ERROR) + { + CloseGTM(); + InitGTM(); + if (conn) + status = get_next(conn, &seqkey, coordName, coordPid, + range, &ret, rangemax); + } + if (status != GTM_RESULT_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s", GTMPQerrorMessage(conn)))); +#else if (conn) - ret = get_next(conn, &seqkey); + ret = get_next(conn, &seqkey); if (ret < 0) { CloseGTM(); InitGTM(); } +#endif return ret; } @@ -404,11 +555,19 @@ int SetValGTM(char *seqname, GTM_Sequence nextval, bool iscalled) { GTM_SequenceKeyData seqkey; +#ifdef XCP + char *coordName = IS_PGXC_COORDINATOR ? 
PGXCNodeName : MyCoordName; + int coordPid = IS_PGXC_COORDINATOR ? MyProcPid : MyCoordPid; +#endif CheckConnection(); seqkey.gsk_keylen = strlen(seqname) + 1; seqkey.gsk_key = seqname; +#ifdef XCP + return conn ? set_val(conn, &seqkey, coordName, coordPid, nextval, iscalled) : -1; +#else return conn ? set_val(conn, &seqkey, nextval, iscalled) : -1; +#endif } /* diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 768781edf9..5fe2369344 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -3,6 +3,11 @@ * varsup.c * postgres OID & XID variables support routines * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Copyright (c) 2000-2012, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -72,6 +77,33 @@ GetForceXidFromGTM(void) } #endif /* PGXC */ + +#ifdef XCP +/* + * Check if GlobalTransactionId associated with the current distributed session + * equals to specified xid. + * It is for tuple visibility checks in secondary datanode sessions, which are + * not associating next_xid with the current transaction. + */ +bool +TransactionIdIsCurrentGlobalTransactionId(TransactionId xid) +{ + return TransactionIdIsValid(next_xid) && TransactionIdEquals(xid, next_xid); +} + + +/* + * Returns GlobalTransactionId associated with the current distributed session + * without assigning it to the transaction. + */ +TransactionId +GetNextTransactionId(void) +{ + return next_xid; +} +#endif + + /* * Allocate the next XID for a new transaction or subtransaction. 
* @@ -87,15 +119,17 @@ TransactionId GetNewTransactionId(bool isSubXact, bool *timestamp_received, GTM_Timestamp *timestamp) #else GetNewTransactionId(bool isSubXact) -#endif +#endif /* PGXC */ { TransactionId xid; #ifdef PGXC bool increment_xid = true; - *timestamp_received = false; +#ifdef XCP + /* Will be set if we obtain from GTM */ + IsXidFromGTM = false; #endif - +#endif /* PGXC */ /* * During bootstrap initialization, we return the special bootstrap * transaction id. @@ -128,7 +162,7 @@ GetNewTransactionId(bool isSubXact) xid = (TransactionId) BeginTranGTM(timestamp); *timestamp_received = true; } -#endif +#endif /* PGXC */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); @@ -165,8 +199,16 @@ GetNewTransactionId(bool isSubXact) } } else if(IS_PGXC_DATANODE || IsConnFromCoord()) - { + { +#ifdef XCP + /* + * (IS_PGXC_DATANODE && IsInitProcessingMode() && IsPostmasterEnvironment) + * handles new connections, ensures XID is consumed then, but not during initdb + */ + if (IsAutoVacuumWorkerProcess() || IsAutoVacuumLauncherProcess() || (IS_PGXC_DATANODE && IsInitProcessingMode() && IsPostmasterEnvironment)) +#else if (IsAutoVacuumWorkerProcess()) +#endif { /* * For an autovacuum worker process, get transaction ID directly from GTM. @@ -180,7 +222,7 @@ GetNewTransactionId(bool isSubXact) next_xid = (TransactionId) BeginTranGTM(timestamp); } else if (GetForceXidFromGTM()) - { + { elog (DEBUG1, "Force get XID from GTM"); /* try and get gxid directly from GTM */ next_xid = (TransactionId) BeginTranGTM(NULL); @@ -204,12 +246,16 @@ GetNewTransactionId(bool isSubXact) } else ShmemVariableCache->nextXid = xid; - } - else + } + else { - /* Fallback to default */ - elog(LOG, "Falling back to local Xid. Was = %d, now is = %d", - next_xid, ShmemVariableCache->nextXid); + if (IsConnFromCoord()) + { + elog(ERROR, "Coordinator has not provided xid for the command"); + } + /* Fallback to default, needed for initdb */ + elog(LOG, "Falling back to local Xid. Was = %d, now is = %d. 
autovacLaunch = %d", + next_xid, ShmemVariableCache->nextXid, IsAutoVacuumLauncherProcess()); xid = ShmemVariableCache->nextXid; } } @@ -217,7 +263,6 @@ GetNewTransactionId(bool isSubXact) xid = ShmemVariableCache->nextXid; #endif /* PGXC */ - /*---------- * Check to see if it's safe to assign another XID. This protects against * catastrophic data loss due to XID wraparound. The basic rules are: diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index d55005f43f..055bd97f18 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5,6 +5,11 @@ * * See src/backend/access/transam/README for more information. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -24,10 +29,12 @@ #ifdef PGXC #include "pgxc/pgxc.h" #include "access/gtm.h" -#include "pgxc/xc_maintenance_mode.h" /* PGXC_COORD */ #include "gtm/gtm_c.h" #include "pgxc/execRemote.h" +#ifdef XCP +#include "pgxc/pause.h" +#endif /* PGXC_DATANODE */ #include "postmaster/autovacuum.h" #include "libpq/pqformat.h" @@ -67,6 +74,11 @@ #include "pg_trace.h" +#ifdef XCP +#define implicit2PC_head "_$XC$" +#endif + + /* * User-tweakable parameters */ @@ -148,8 +160,10 @@ typedef struct TransactionStateData GlobalTransactionId transactionId; GlobalTransactionId topGlobalTransansactionId; GlobalTransactionId auxilliaryTransactionId; +#ifndef XCP bool isLocalParameterUsed; /* Check if a local parameter is active * in transaction block (SET LOCAL, DEFERRED) */ +#endif #else TransactionId transactionId; /* my XID, or 
Invalid if none */ #endif @@ -184,7 +198,9 @@ static TransactionStateData TopTransactionStateData = { 0, /* global transaction id */ 0, /* prepared global transaction id */ 0, /* commit prepared global transaction id */ +#ifndef XCP false, /* isLocalParameterUsed */ +#endif #else 0, /* transaction id */ #endif @@ -270,6 +286,9 @@ static TimestampTz GTMdeltaTimestamp = 0; */ static char *prepareGID; static char *savePrepareGID; +#ifdef XCP +static char *saveNodeString = NULL; +#endif static bool XactLocalNodePrepared; static bool XactReadLocalNode; static bool XactWriteLocalNode; @@ -455,6 +474,15 @@ GetCurrentTransactionId(void) { TransactionState s = CurrentTransactionState; +#ifdef XCP + /* + * Never assign xid to the secondary session, that causes conflicts when + * writing to the clog at the transaction end. + */ + if (IsConnFromDatanode()) + return GetNextTransactionId(); +#endif + if (!TransactionIdIsValid(s->transactionId)) AssignTransactionId(s); return s->transactionId; @@ -501,6 +529,7 @@ GetStableLatestTransactionId(void) } #ifdef PGXC +#ifndef XCP /* * GetCurrentLocalParamStatus * @@ -524,6 +553,7 @@ SetCurrentLocalParamStatus(bool status) CurrentTransactionState->isLocalParameterUsed = status; } #endif +#endif /* * AssignTransactionId @@ -595,7 +625,7 @@ AssignTransactionId(TransactionState s) } #else s->transactionId = GetNewTransactionId(isSubXact); -#endif +#endif /* PGXC */ if (isSubXact) SubTransSetParent(s->transactionId, s->parent->transactionId, false); @@ -722,6 +752,7 @@ GetCurrentSubTransactionId(void) return s->subTransactionId; } + /* * GetCurrentCommandId * @@ -735,7 +766,11 @@ GetCurrentCommandId(bool used) { #ifdef PGXC /* If coordinator has sent a command id, remote node should use it */ +#ifdef XCP + if (isCommandIdReceived) +#else if (IsConnFromCoord() && isCommandIdReceived) +#endif { /* * Indicate to successive calls of this function that the sent command id has @@ -910,6 +945,16 @@ TransactionIdIsCurrentTransactionId(TransactionId 
xid) if (!TransactionIdIsNormal(xid)) return false; +#ifdef XCP + /* + * The current TransactionId of secondary datanode session is never + * associated with the current transaction, so if it is a secondary + * Datanode session look into xid sent from the parent. + */ + if (IsConnFromDatanode() && TransactionIdIsCurrentGlobalTransactionId(xid)) + return true; +#endif + /* * We will return true for the Xid of the current subtransaction, any of * its subcommitted children, any of its parents, or any of their @@ -1968,8 +2013,10 @@ StartTransaction(void) */ s->state = TRANS_START; #ifdef PGXC +#ifndef XCP s->isLocalParameterUsed = false; #endif +#endif s->transactionId = InvalidTransactionId; /* until assigned */ /* * Make sure we've reset xact state variables @@ -1990,8 +2037,7 @@ StartTransaction(void) XactReadOnly = DefaultXactReadOnly; #ifdef PGXC /* Save Postgres-XC session as read-only if necessary */ - if (!xc_maintenance_mode) - XactReadOnly |= IsPGXCNodeXactReadOnly(); + XactReadOnly |= IsPGXCNodeXactReadOnly(); #endif } XactDeferrable = DefaultXactDeferrable; @@ -2153,6 +2199,15 @@ CommitTransaction(void) savePrepareGID = NULL; } +#ifdef XCP + if (saveNodeString) + { + pfree(saveNodeString); + saveNodeString = NULL; + } +#endif + +#ifndef XCP /* * Check if there are any ON COMMIT actions or if temporary objects are in use. * If session is set-up to enforce 2PC for such transactions, return an error. @@ -2168,16 +2223,28 @@ CommitTransaction(void) errmsg("cannot PREPARE a transaction that has operated on temporary tables"), errdetail("Disabling enforce_two_phase_commit is recommended to enforce COMMIT"))); } +#endif /* * If the local node has done some write activity, prepare the local node * first. 
If that fails, the transaction is aborted on all the remote * nodes */ +#ifdef XCP + /* + * Fired OnCommit actions would fail 2PC process + */ + if (!IsOnCommitActions() && IsTwoPhaseCommitRequired(XactWriteLocalNode)) +#else if (IsTwoPhaseCommitRequired(XactWriteLocalNode)) +#endif { prepareGID = MemoryContextAlloc(TopTransactionContext, 256); +#ifdef XCP + sprintf(prepareGID, implicit2PC_head"%u", GetTopTransactionId()); +#else sprintf(prepareGID, "T%u", GetTopTransactionId()); +#endif savePrepareGID = MemoryContextStrdup(TopMemoryContext, prepareGID); @@ -2205,7 +2272,14 @@ CommitTransaction(void) s->auxilliaryTransactionId = GetTopTransactionId(); } else +#ifdef XCP + { + s->auxilliaryTransactionId = InvalidGlobalTransactionId; + PrePrepare_Remote(prepareGID, false, true); + } +#else s->auxilliaryTransactionId = InvalidGlobalTransactionId; +#endif } } #endif @@ -2266,14 +2340,21 @@ CommitTransaction(void) PreCommit_Notify(); #ifdef PGXC +#ifdef XCP + if (IS_PGXC_DATANODE || !IsConnFromCoord()) +#else if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) +#endif { /* * Now run 2PC on the remote nodes. Any errors will be reported via * ereport and we will run error recovery as part of AbortTransaction */ +#ifdef XCP + PreCommit_Remote(savePrepareGID, saveNodeString, XactLocalNodePrepared); +#else PreCommit_Remote(savePrepareGID, XactLocalNodePrepared); - +#endif /* * Now that all the remote nodes have successfully prepared and * commited, commit the local transaction as well. Remember, any errors @@ -2392,6 +2473,16 @@ CommitTransaction(void) AtEOXact_MultiXact(); +#ifdef XCP + /* If the cluster lock was held at commit time, keep it locked! 
*/ + if (cluster_ex_lock_held) + { + elog(DEBUG2, "PAUSE CLUSTER still held at commit"); + /*if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + RequestClusterPause(false, NULL);*/ + } +#endif + ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, true, true); @@ -2436,7 +2527,9 @@ CommitTransaction(void) s->maxChildXids = 0; #ifdef PGXC +#ifndef XCP s->isLocalParameterUsed = false; +#endif ForgetTransactionLocalNode(); /* @@ -2506,6 +2599,27 @@ AtEOXact_GlobalTxn(bool commit) RollbackTranGTM(s->topGlobalTransansactionId); } } +#ifdef XCP + /* + * If GTM is connected the current gxid is acquired from GTM directly. + * So directly report transaction end. However this applies only if + * the connection is directly from a client. + */ + else if (IsXidFromGTM) + { + IsXidFromGTM = false; + if (commit) + CommitTranGTM(s->topGlobalTransansactionId); + else + RollbackTranGTM(s->topGlobalTransansactionId); + + if (IsGTMConnected() && + !IsConnFromCoord() && !IsConnFromDatanode()) + { + CloseGTM(); + } + } +#else else if (IS_PGXC_DATANODE || IsConnFromCoord()) { /* If we are autovacuum, commit on GTM */ @@ -2525,7 +2639,7 @@ AtEOXact_GlobalTxn(bool commit) RollbackTranGTM(currentGxid); } } - +#endif s->topGlobalTransansactionId = InvalidGlobalTransactionId; s->auxilliaryTransactionId = InvalidGlobalTransactionId; @@ -2552,8 +2666,10 @@ PrepareTransaction(void) TimestampTz prepared_at; #ifdef PGXC bool isImplicit = !(s->blockState == TBLOCK_PREPARE); +#ifndef XCP char *nodestring = NULL; #endif +#endif ShowTransactionState("PrepareTransaction"); @@ -2566,6 +2682,7 @@ PrepareTransaction(void) Assert(s->parent == NULL); #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { if (savePrepareGID) @@ -2581,6 +2698,7 @@ PrepareTransaction(void) CallGTMCallbacks(GTM_EVENT_PREPARE); } #endif +#endif /* * Do pre-commit processing that involves calling user-defined code, such @@ -2604,6 +2722,35 @@ PrepareTransaction(void) break; } +#ifdef XCP + /* 
+ * Remote nodes must be done AFTER portals. If portal is still active it may + * need to send down a message to close remote objects on Datanode, but + * PrePrepare_Remote releases connections to remote nodes. + */ + if (IS_PGXC_DATANODE || !IsConnFromCoord()) + { + char *nodestring; + if (saveNodeString) + pfree(saveNodeString); + + /* Needed in PrePrepare_Remote to submit nodes to GTM */ + s->topGlobalTransansactionId = s->transactionId; + if (savePrepareGID) + pfree(savePrepareGID); + savePrepareGID = MemoryContextStrdup(TopMemoryContext, prepareGID); + nodestring = PrePrepare_Remote(savePrepareGID, XactWriteLocalNode, isImplicit); + if (nodestring) + saveNodeString = MemoryContextStrdup(TopMemoryContext, nodestring); + + /* + * Callback on GTM if necessary, this needs to be done before HOLD_INTERRUPTS + * as this is not a part of the end of transaction processing involving clean up. + */ + CallGTMCallbacks(GTM_EVENT_PREPARE); + } +#endif + /* * The remaining actions cannot call any user-defined code, so it's safe * to start shutting down within-transaction services. But note that most @@ -2827,7 +2974,11 @@ PrepareTransaction(void) */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { +#ifdef XCP + PostPrepare_Remote(savePrepareGID, isImplicit); +#else PostPrepare_Remote(savePrepareGID, nodestring, isImplicit); +#endif if (!isImplicit) s->topGlobalTransansactionId = InvalidGlobalTransactionId; ForgetTransactionLocalNode(); @@ -5798,7 +5949,9 @@ IsTransactionLocalNode(bool write) bool IsXidImplicit(const char *xid) { +#ifndef XCP #define implicit2PC_head "_$XC$" +#endif const size_t implicit2PC_head_len = strlen(implicit2PC_head); if (strncmp(xid, implicit2PC_head, implicit2PC_head_len)) @@ -5820,7 +5973,9 @@ SaveReceivedCommandId(CommandId cid) * Change command ID information status to report any changes in remote ID * for a remote node. A new command ID has also been received. 
*/ +#ifndef XCP if (IsConnFromCoord()) +#endif { SetSendCommandId(true); isCommandIdReceived = true; @@ -5899,9 +6054,9 @@ IsPGXCNodeXactReadOnly(void) * For the time being a Postgres-XC session is read-only * under very specific conditions. * This is the case of an application accessing directly - * a Datanode. + * a Datanode provided the server was not started in restore mode. */ - return IsPGXCNodeXactDatanodeDirect(); + return IsPGXCNodeXactDatanodeDirect() && !isRestoreMode; } /* @@ -5929,6 +6084,9 @@ IsPGXCNodeXactDatanodeDirect(void) (IsPostmasterEnvironment || !useLocalXid) && IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess() && +#ifdef XCP + !IsConnFromDatanode() && +#endif !IsConnFromCoord(); } #endif diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 841ebec63b..03ff91e046 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -4,6 +4,11 @@ * routines to support running postgres in 'bootstrap' mode * bootstrap mode is used to create the initial template database * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index fb3ca97994..ec884289cb 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -72,6 +72,7 @@ install-data: $(BKIFILES) installdirs $(INSTALL_DATA) $(call vpathsearch,postgres.shdescription) '$(DESTDIR)$(datadir)/postgres.shdescription' $(INSTALL_DATA) $(srcdir)/system_views.sql '$(DESTDIR)$(datadir)/system_views.sql' $(INSTALL_DATA) $(srcdir)/information_schema.sql '$(DESTDIR)$(datadir)/information_schema.sql' + $(INSTALL_DATA) $(srcdir)/storm_catalog.sql '$(DESTDIR)$(datadir)/storm_catalog.sql' $(INSTALL_DATA) $(srcdir)/sql_features.txt '$(DESTDIR)$(datadir)/sql_features.txt' installdirs: diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 96aba2e0ae..c13a3f39f6 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -5,6 +5,11 @@ * bits of hard-wired knowledge * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -602,6 +607,11 @@ GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence) switch (relpersistence) { case RELPERSISTENCE_TEMP: +#ifdef XCP + if (OidIsValid(MyCoordId)) + backend = MyFirstBackendId; + else +#endif backend = MyBackendId; break; case RELPERSISTENCE_UNLOGGED: diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 29f324369e..c6637e0f71 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -4,6 +4,11 @@ * Routines to support inter-object dependencies. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -446,7 +451,7 @@ performRename(const ObjectAddress *object, const char *oldname, const char *newn NULL, /* empty stack */ targetObjects, NULL, - depRel); + &depRel); /* Check Objects one by one to see if some of them have to be renamed on GTM */ for (i = 0; i < targetObjects->numrefs; i++) @@ -1264,6 +1269,7 @@ doDeletion(const ObjectAddress *object, int flags) break; case RELKIND_RELATION: case RELKIND_VIEW: +#ifndef XCP /* * Flag temporary objects in use in case a temporary table or view * is dropped by dependency. 
This check is particularly useful with @@ -1273,6 +1279,7 @@ doDeletion(const ObjectAddress *object, int flags) */ if (IsTempTable(object->objectId)) ExecSetTempObjectIncluded(); +#endif break; default: break; diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl index e1e5374884..adb3d81cec 100644 --- a/src/backend/catalog/genbki.pl +++ b/src/backend/catalog/genbki.pl @@ -220,7 +220,7 @@ foreach my $catname ( @{ $catalogs->{names} } ) {cmax => 'cid'}, {tableoid => 'oid'} #PGXC_BEGIN - ,{xc_node_id => 'int4'} + ,{ xc_node_id => 'int4' } #PGXC_END ); foreach my $attr (@SYS_ATTRS) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index e5d05418bd..6741f90a3f 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -77,6 +77,7 @@ #include "pgxc/nodemgr.h" #include "pgxc/pgxc.h" #include "pgxc/pgxcnode.h" +#include "pgxc/postgresql_fdw.h" #endif @@ -1033,7 +1034,7 @@ GetRelationDistributionItems(Oid relid, for (i = 0; i < descriptor->natts; i++) { attr = descriptor->attrs[i]; - if (IsTypeDistributable(attr->atttypid)) + if (IsTypeHashDistributable(attr->atttypid)) { /* distribute on this column */ local_attnum = i + 1; @@ -1065,7 +1066,7 @@ GetRelationDistributionItems(Oid relid, errmsg("Invalid distribution column specified"))); } - if (!IsTypeDistributable(descriptor->attrs[local_attnum - 1]->atttypid)) + if (!IsTypeHashDistributable(descriptor->attrs[local_attnum - 1]->atttypid)) { ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -1088,7 +1089,7 @@ GetRelationDistributionItems(Oid relid, errmsg("Invalid distribution column specified"))); } - if (!IsTypeDistributable(descriptor->attrs[local_attnum - 1]->atttypid)) + if (!IsTypeModuloDistributable(descriptor->attrs[local_attnum - 1]->atttypid)) { ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 997a6f35cd..dd5a9da65b 100644 --- a/src/backend/catalog/namespace.c +++ 
b/src/backend/catalog/namespace.c @@ -9,6 +9,11 @@ * and implementing search-path-controlled searches. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -47,6 +52,9 @@ #include "parser/parse_func.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#ifdef XCP +#include "storage/proc.h" +#endif #include "storage/sinval.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -196,6 +204,9 @@ static void RemoveTempRelationsCallback(int code, Datum arg); static void NamespaceCallback(Datum arg, int cacheid, uint32 hashvalue); static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames, int **argnumbers); +#ifdef XCP +static void FindTemporaryNamespace(void); +#endif /* These don't really need to appear in any header file */ Datum pg_table_is_visible(PG_FUNCTION_ARGS); @@ -704,7 +715,11 @@ RelationIsVisible(Oid relid) * list_member_oid() for them. */ relnamespace = relform->relnamespace; +#ifdef XCP + if (relnamespace != PG_CATALOG_NAMESPACE && relnamespace != STORM_CATALOG_NAMESPACE && +#else if (relnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, relnamespace)) visible = false; else @@ -799,7 +814,11 @@ TypeIsVisible(Oid typid) * list_member_oid() for them. */ typnamespace = typform->typnamespace; +#ifdef XCP + if (typnamespace != PG_CATALOG_NAMESPACE && typnamespace != STORM_CATALOG_NAMESPACE && +#else if (typnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, typnamespace)) visible = false; else @@ -1389,7 +1408,11 @@ FunctionIsVisible(Oid funcid) * list_member_oid() for them. 
*/ pronamespace = procform->pronamespace; +#ifdef XCP + if (pronamespace != PG_CATALOG_NAMESPACE && pronamespace != STORM_CATALOG_NAMESPACE && +#else if (pronamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, pronamespace)) visible = false; else @@ -1713,7 +1736,11 @@ OperatorIsVisible(Oid oprid) * list_member_oid() for them. */ oprnamespace = oprform->oprnamespace; +#ifdef XCP + if (oprnamespace != PG_CATALOG_NAMESPACE && oprnamespace != STORM_CATALOG_NAMESPACE && +#else if (oprnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, oprnamespace)) visible = false; else @@ -1799,7 +1826,11 @@ OpclassIsVisible(Oid opcid) * list_member_oid() for them. */ opcnamespace = opcform->opcnamespace; +#ifdef XCP + if (opcnamespace != PG_CATALOG_NAMESPACE && opcnamespace != STORM_CATALOG_NAMESPACE && +#else if (opcnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, opcnamespace)) visible = false; else @@ -1882,7 +1913,11 @@ OpfamilyIsVisible(Oid opfid) * list_member_oid() for them. */ opfnamespace = opfform->opfnamespace; +#ifdef XCP + if (opfnamespace != PG_CATALOG_NAMESPACE && opfnamespace != STORM_CATALOG_NAMESPACE && +#else if (opfnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, opfnamespace)) visible = false; else @@ -1972,7 +2007,11 @@ CollationIsVisible(Oid collid) * list_member_oid() for them. */ collnamespace = collform->collnamespace; +#ifdef XCP + if (collnamespace != PG_CATALOG_NAMESPACE && collnamespace != STORM_CATALOG_NAMESPACE && +#else if (collnamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, collnamespace)) visible = false; else @@ -2054,7 +2093,11 @@ ConversionIsVisible(Oid conid) * list_member_oid() for them. 
*/ connamespace = conform->connamespace; +#ifdef XCP + if (connamespace != PG_CATALOG_NAMESPACE && connamespace != STORM_CATALOG_NAMESPACE && +#else if (connamespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, connamespace)) visible = false; else @@ -2156,7 +2199,11 @@ TSParserIsVisible(Oid prsId) * list_member_oid() for them. */ namespace = form->prsnamespace; +#ifdef XCP + if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE && +#else if (namespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, namespace)) visible = false; else @@ -2280,7 +2327,11 @@ TSDictionaryIsVisible(Oid dictId) * list_member_oid() for them. */ namespace = form->dictnamespace; +#ifdef XCP + if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE && +#else if (namespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, namespace)) visible = false; else @@ -2403,7 +2454,11 @@ TSTemplateIsVisible(Oid tmplId) * list_member_oid() for them. */ namespace = form->tmplnamespace; +#ifdef XCP + if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE && +#else if (namespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, namespace)) visible = false; else @@ -2527,7 +2582,11 @@ TSConfigIsVisible(Oid cfgid) * list_member_oid() for them. */ namespace = form->cfgnamespace; +#ifdef XCP + if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE && +#else if (namespace != PG_CATALOG_NAMESPACE && +#endif !list_member_oid(activeSearchPath, namespace)) visible = false; else @@ -2639,13 +2698,22 @@ LookupNamespaceNoError(const char *nspname) { if (OidIsValid(myTempNamespace)) return myTempNamespace; - +#ifdef XCP + /* + * Try to find temporary namespace created by other backend of + * the same distributed session. If not found myTempNamespace will + * be InvalidOid, that is correct result. 
+ */ + FindTemporaryNamespace(); + return myTempNamespace; +#else /* * Since this is used only for looking up existing objects, there is * no point in trying to initialize the temp namespace here; and doing * so might create problems for some callers. Just report "not found". */ return InvalidOid; +#endif } return get_namespace_oid(nspname, true); @@ -2670,6 +2738,16 @@ LookupExplicitNamespace(const char *nspname) if (OidIsValid(myTempNamespace)) return myTempNamespace; +#ifdef XCP + /* + * Try to find temporary namespace created by other backend of + * the same distributed session. + */ + FindTemporaryNamespace(); + if (OidIsValid(myTempNamespace)) + return myTempNamespace; +#endif + /* * Since this is used only for looking up existing objects, there is * no point in trying to initialize the temp namespace here; and doing @@ -3068,7 +3146,16 @@ GetOverrideSearchPath(MemoryContext context) result->addTemp = true; else { +#ifdef XCP + /* + * The while loop assumes that you can only have one catalog schema + * in the namespace. Not quite.. + */ + Assert(linitial_oid(schemas) == STORM_CATALOG_NAMESPACE || + linitial_oid(schemas) == PG_CATALOG_NAMESPACE); +#else Assert(linitial_oid(schemas) == PG_CATALOG_NAMESPACE); +#endif result->addCatalog = true; } schemas = list_delete_first(schemas); @@ -3145,7 +3232,14 @@ PushOverrideSearchPath(OverrideSearchPath *newpath) * permissions for these. 
*/ if (newpath->addCatalog) +#ifdef XCP + { + oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist); + oidlist = lcons_oid(STORM_CATALOG_NAMESPACE, oidlist); + } +#else oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist); +#endif if (newpath->addTemp && OidIsValid(myTempNamespace)) oidlist = lcons_oid(myTempNamespace, oidlist); @@ -3472,6 +3566,11 @@ recomputeNamespacePath(void) if (!list_member_oid(oidlist, PG_CATALOG_NAMESPACE)) oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist); +#ifdef XCP + if (!list_member_oid(oidlist, STORM_CATALOG_NAMESPACE)) + oidlist = lcons_oid(STORM_CATALOG_NAMESPACE, oidlist); +#endif + if (OidIsValid(myTempNamespace) && !list_member_oid(oidlist, myTempNamespace)) oidlist = lcons_oid(myTempNamespace, oidlist); @@ -3550,6 +3649,16 @@ InitTempTableNamespace(void) (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), errmsg("cannot create temporary tables during recovery"))); +#ifdef XCP + /* + * In case of distributed session use MyFirstBackendId for temp objects + */ + if (OidIsValid(MyCoordId)) + snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d", + MyFirstBackendId); + else + /* fallback to default */ +#endif snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d", MyBackendId); namespaceId = get_namespace_oid(namespaceName, true); @@ -3582,6 +3691,16 @@ InitTempTableNamespace(void) * it. (We assume there is no need to clean it out if it does exist, since * dropping a parent table should make its toast table go away.) */ +#ifdef XCP + /* + * In case of distributed session use MyFirstBackendId for temp objects + */ + if (OidIsValid(MyCoordId)) + snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d", + MyFirstBackendId); + else + /* fallback to default */ +#endif snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d", MyBackendId); @@ -3604,6 +3723,9 @@ InitTempTableNamespace(void) /* It should not be done already. 
*/ AssertState(myTempNamespaceSubID == InvalidSubTransactionId); +#ifdef XCP + if (!OidIsValid(MyCoordId)) +#endif myTempNamespaceSubID = GetCurrentSubTransactionId(); baseSearchPathValid = false; /* need to rebuild list */ @@ -3626,7 +3748,20 @@ AtEOXact_Namespace(bool isCommit) if (myTempNamespaceSubID != InvalidSubTransactionId) { if (isCommit) +#ifdef XCP + { + /* + * During backend lifetime it may be assigned to different + * distributed sessions, and each of them may create temp + * namespace and set a callback. That may cause memory leak. + * XXX is it ever possible to remove callbacks? + */ + if (!OidIsValid(MyCoordId)) + on_shmem_exit(RemoveTempRelationsCallback, 0); + } +#else on_shmem_exit(RemoveTempRelationsCallback, 0); +#endif else { myTempNamespace = InvalidOid; @@ -3783,9 +3918,46 @@ ResetTempTableNamespace(void) { if (OidIsValid(myTempNamespace)) RemoveTempRelations(myTempNamespace); +#ifdef XCP + else if (OidIsValid(MyCoordId)) + { + char namespaceName[NAMEDATALEN]; + Oid namespaceId; + + snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d", + MyFirstBackendId); + + namespaceId = get_namespace_oid(namespaceName, true); + if (OidIsValid(namespaceId)) + RemoveTempRelations(namespaceId); + } +#endif } +#ifdef XCP +/* + * Reset myTempNamespace so it will be reinitialized after backend is assigned + * to a different session. + */ +void +ForgetTempTableNamespace(void) +{ + /* If the namespace exists and need to be cleaned up do that */ + if (OidIsValid(myTempNamespace) && + myTempNamespaceSubID != InvalidSubTransactionId) + { + elog(WARNING, "leaked temp namespace clean up callback"); + RemoveTempRelations(myTempNamespace); + } + myTempNamespace = InvalidOid; + myTempToastNamespace = InvalidOid; + baseSearchPathValid = false; /* need to rebuild list */ + myTempNamespaceSubID = InvalidSubTransactionId; +} +#endif + + /* * Routines for handling the GUC variable 'search_path'. 
*/ @@ -4121,3 +4293,43 @@ pg_is_other_temp_schema(PG_FUNCTION_ARGS) PG_RETURN_BOOL(isOtherTempNamespace(oid)); } + + +#ifdef XCP +/* + * FindTemporaryNamespace + * If this is secondary backend of distributed session check if primary backend + * of the same session created temporary namespace and wire it up if it is the + * case, instead of creating new. + */ +static void +FindTemporaryNamespace(void) +{ + char namespaceName[NAMEDATALEN]; + + Assert(!OidIsValid(myTempNamespace)); + + /* + * We need distribution session identifier to find the namespace. + */ + if (!OidIsValid(MyCoordId)) + return; + + /* + * Look up namespace by name. This code should be in synch with + * InitTempTableNamespace. + */ + snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d", + MyFirstBackendId); + myTempNamespace = get_namespace_oid(namespaceName, true); + /* Same for the toast namespace */ + if (OidIsValid(myTempNamespace)) + { + snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d", + MyFirstBackendId); + myTempToastNamespace = get_namespace_oid(namespaceName, true); + baseSearchPathValid = false; /* need to rebuild list */ + } +} +#endif + diff --git a/src/backend/catalog/pg_aggregate.c b/src/backend/catalog/pg_aggregate.c index 856adc3b0f..768732888c 100644 --- a/src/backend/catalog/pg_aggregate.c +++ b/src/backend/catalog/pg_aggregate.c @@ -3,6 +3,11 @@ * pg_aggregate.c * routines to support manipulation of the pg_aggregate relation * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -53,6 +58,9 @@ AggregateCreate(const char *aggName, List *aggfinalfnName, List *aggsortopName, Oid aggTransType, +#ifdef XCP + Oid aggCollectType, +#endif #ifdef PGXC const char *agginitval, const char *agginitcollect) @@ -172,6 +180,27 @@ AggregateCreate(const char *aggName, ReleaseSysCache(tup); #ifdef PGXC +#ifdef XCP + if (aggcollectfnName) + { + /* + * Collection function must be of two arguments + * First must be of aggCollectType, second must be of aggTransType + * Return value must be of aggCollectType + */ + fnArgs[0] = aggCollectType; + fnArgs[1] = aggTransType; + collectfn = lookup_agg_function(aggcollectfnName, 2, fnArgs, + &rettype); + if (rettype != aggCollectType) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("return type of collection function %s is not %s", + NameListToString(aggcollectfnName), + format_type_be(aggCollectType) + ))); + } +#else if (aggcollectfnName) { /* @@ -189,11 +218,16 @@ AggregateCreate(const char *aggName, NameListToString(aggcollectfnName), format_type_be(aggTransType)))); } - +#endif #endif /* handle finalfn, if supplied */ if (aggfinalfnName) { +#ifdef XCP + if (OidIsValid(aggCollectType)) + fnArgs[0] = aggCollectType; + else +#endif fnArgs[0] = aggTransType; finalfn = lookup_agg_function(aggfinalfnName, 1, fnArgs, &finaltype); @@ -203,6 +237,11 @@ AggregateCreate(const char *aggName, /* * If no finalfn, aggregate result type is type of the state value */ +#ifdef XCP + if (OidIsValid(aggCollectType)) + finaltype = aggCollectType; + else +#endif finaltype = aggTransType; } Assert(OidIsValid(finaltype)); @@ -319,6 +358,9 @@ AggregateCreate(const char *aggName, #ifdef PGXC values[Anum_pg_aggregate_aggcollectfn - 1] = ObjectIdGetDatum(collectfn); #endif +#ifdef XCP + values[Anum_pg_aggregate_aggcollecttype - 1] = ObjectIdGetDatum(aggCollectType); +#endif if 
(agginitval) values[Anum_pg_aggregate_agginitval - 1] = CStringGetTextDatum(agginitval); else diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c index c758f63224..6363081b1f 100644 --- a/src/backend/catalog/pg_proc.c +++ b/src/backend/catalog/pg_proc.c @@ -3,6 +3,11 @@ * pg_proc.c * routines to support manipulation of the pg_proc relation * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -39,6 +44,7 @@ #ifdef PGXC #include "pgxc/execRemote.h" #include "pgxc/pgxc.h" +#include "pgxc/planner.h" #endif @@ -903,6 +909,7 @@ fmgr_sql_validator(PG_FUNCTION_ARGS) pinfo); #ifdef PGXC +#ifndef XCP /* Check if the list of queries contains temporary objects */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { @@ -915,6 +922,7 @@ fmgr_sql_validator(PG_FUNCTION_ARGS) ExecSetTempObjectIncluded(); } #endif +#endif querytree_list = list_concat(querytree_list, querytree_sublist); diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 993bc49c2a..f9ee56d5ab 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -3,6 +3,11 @@ * storage.c * code to create and destroy physical storage for relations * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -106,6 +111,11 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) switch (relpersistence) { case RELPERSISTENCE_TEMP: +#ifdef XCP + if (OidIsValid(MyCoordId)) + backend = MyFirstBackendId; + else +#endif backend = MyBackendId; needs_wal = false; break; diff --git a/src/backend/catalog/storm_catalog.sql b/src/backend/catalog/storm_catalog.sql new file mode 100644 index 0000000000..47776ba00a --- /dev/null +++ b/src/backend/catalog/storm_catalog.sql @@ -0,0 +1,307 @@ +CREATE VIEW storm_catalog.pg_roles AS + SELECT * + FROM pg_catalog.pg_roles + WHERE rolname = current_user + OR split_part(rolname, '@', 2) = current_database(); + +GRANT SELECT on storm_catalog.pg_roles TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_roles FROM public; + +CREATE VIEW storm_catalog.pg_shdescription AS + SELECT d.objoid, d.classoid, d.description + FROM pg_catalog.pg_shdescription d, pg_catalog.pg_class c + WHERE d.classoid = c.oid + AND c.relname = 'pg_database' + AND d.objoid = (SELECT oid FROM pg_database WHERE datname = current_database()) + UNION + SELECT d.objoid, d.classoid, d.description + FROM pg_catalog.pg_shdescription d, pg_catalog.pg_class c + WHERE d.classoid = c.oid + AND c.relname = 'pg_authid' + AND d.objoid = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user); + +GRANT SELECT on storm_catalog.pg_shdescription TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_shdescription FROM public; + +CREATE VIEW storm_catalog.pg_database AS + SELECT tableoid, oid, datname, datdba, encoding, datcollate, datctype, + datistemplate, datallowconn, datconnlimit, datlastsysoid, + datfrozenxid, dattablespace, datacl + FROM pg_catalog.pg_database + WHERE datallowconn AND (has_database_privilege(datname, 'CREATE') OR + split_part(current_user, '@', 2) = datname); + +GRANT SELECT on storm_catalog.pg_database TO PUBLIC; + +REVOKE 
ALL on pg_catalog.pg_database FROM public; + +CREATE VIEW storm_catalog.pg_db_role_setting AS + SELECT setdatabase, setrole, setconfig + FROM pg_catalog.pg_db_role_setting + WHERE setdatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + UNION + SELECT setdatabase, setrole, setconfig + FROM pg_db_role_setting + WHERE setrole = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user); + +GRANT SELECT on storm_catalog.pg_db_role_setting TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_db_role_setting FROM public; + +CREATE VIEW storm_catalog.pg_tablespace AS + SELECT oid, spcname, spcowner, ''::text as spclocation, ''::text as spcacl, + ''::text as spcoptions FROM pg_catalog.pg_tablespace; + +GRANT SELECT on storm_catalog.pg_tablespace TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_tablespace FROM public; + +CREATE VIEW storm_catalog.pg_auth_members AS + SELECT roleid, member, grantor, admin_option + FROM pg_catalog.pg_auth_members + WHERE roleid = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user) + UNION + SELECT roleid, member, grantor, admin_option + FROM pg_catalog.pg_auth_members + WHERE grantor = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user); + +GRANT SELECT on storm_catalog.pg_auth_members TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_auth_members FROM public; + +CREATE VIEW storm_catalog.pg_shdepend AS + SELECT dbid, classid, objid, objsubid, refclassid, refobjid, deptype + FROM pg_catalog.pg_shdepend + WHERE dbid = (SELECT oid FROM pg_database WHERE datname = current_database()); + +GRANT SELECT on storm_catalog.pg_shdepend TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_shdepend FROM public; + +CREATE VIEW storm_catalog.pg_stat_database AS + SELECT * + FROM pg_catalog.pg_stat_database + WHERE datid = (SELECT oid FROM pg_database WHERE datname = current_database()); + +GRANT SELECT on storm_catalog.pg_stat_database TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_stat_database FROM public; + +CREATE VIEW 
storm_catalog.pg_stat_database_conflicts AS + SELECT * + FROM pg_catalog.pg_stat_database_conflicts + WHERE datid = (SELECT oid FROM pg_database WHERE datname = current_database()); + +GRANT SELECT on storm_catalog.pg_stat_database_conflicts TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_stat_database_conflicts FROM public; + + +CREATE VIEW storm_catalog.pg_prepared_xacts AS + SELECT * + FROM pg_catalog.pg_prepared_xacts + WHERE database = current_database(); + +GRANT SELECT on storm_catalog.pg_prepared_xacts TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_prepared_xacts FROM public; + +CREATE VIEW storm_catalog.pg_user AS + SELECT * + FROM pg_catalog.pg_user + WHERE usename = current_user + OR split_part(usename, '@', 2) = current_database(); + +GRANT SELECT on storm_catalog.pg_user TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_user FROM public; + +CREATE VIEW storm_catalog.pg_group AS + SELECT * + FROM pg_catalog.pg_group + WHERE split_part(groname, '@', 2) = current_database(); + +GRANT SELECT on storm_catalog.pg_group TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_group FROM public; + +CREATE VIEW storm_catalog.pg_shadow AS + SELECT * + FROM pg_catalog.pg_shadow + WHERE usename = current_user + OR split_part(usename, '@', 2) = current_database(); + +GRANT SELECT on storm_catalog.pg_shadow TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_shadow FROM public; + +CREATE VIEW storm_catalog.pg_user_mappings AS + SELECT * + FROM pg_catalog.pg_user_mappings + WHERE usename = current_user + OR split_part(usename, '@', 2) = current_database(); + +GRANT SELECT on storm_catalog.pg_user_mappings TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_user_mappings FROM public; + +REVOKE ALL on pg_catalog.pg_stat_bgwriter FROM public; + +REVOKE ALL on pg_catalog.pg_seclabels FROM public; + +REVOKE ALL on FUNCTION pg_catalog.pg_conf_load_time() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_current_xlog_insert_location() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_current_xlog_location() FROM PUBLIC; 
+ +REVOKE ALL on FUNCTION pg_catalog.pg_is_in_recovery() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_last_xlog_receive_location() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_last_xlog_replay_location() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_postmaster_start_time() FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_databases(oid) FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_size(oid) FROM PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_size(name) FROM PUBLIC; + +CREATE FUNCTION storm_catalog.pg_database_size(name) RETURNS bigint AS +$BODY$ +BEGIN + IF $1 = current_database() THEN + return pg_catalog.pg_database_size($1); + END IF; + + return 0; +END +$BODY$ +LANGUAGE 'plpgsql' ; + +GRANT EXECUTE on FUNCTION storm_catalog.pg_database_size(name) TO PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_database_size(name) FROM PUBLIC; + +CREATE OR REPLACE FUNCTION storm_catalog.pg_database_size(oid) RETURNS bigint AS +$BODY$ +DECLARE + is_current_db boolean; +BEGIN + SELECT $1 = oid + INTO is_current_db + FROM pg_catalog.pg_database + WHERE datname = current_database(); + + IF is_current_db THEN + return pg_catalog.pg_database_size($1); + END IF; + + return 0; +END +$BODY$ +LANGUAGE 'plpgsql' ; + +GRANT EXECUTE on FUNCTION storm_catalog.pg_database_size(oid) TO PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_database_size(oid) FROM PUBLIC; + +CREATE FUNCTION storm_catalog.pg_show_all_settings( + OUT name text, OUT setting text, OUT unit text, OUT category text, + OUT short_desc text, OUT extra_desc text, OUT context text, + OUT vartype text, OUT source text, OUT min_val text, OUT max_val text, + OUT enumvals text[], OUT boot_val text, OUT reset_val text, + OUT sourcefile text, OUT sourceline integer) +RETURNS SETOF record AS +$BODY$ +BEGIN + RETURN QUERY + SELECT * + FROM pg_catalog.pg_show_all_settings() s + WHERE s.context != 'postmaster' + AND s.context != 'sighup'; +END +$BODY$ +LANGUAGE 'plpgsql' 
SECURITY DEFINER; + +GRANT EXECUTE on FUNCTION storm_catalog.pg_show_all_settings() TO PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_show_all_settings() FROM PUBLIC; + +CREATE VIEW storm_catalog.pg_settings AS + SELECT * + FROM pg_show_all_settings(); + +GRANT SELECT on storm_catalog.pg_settings TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_settings FROM public; + +CREATE FUNCTION storm_catalog.pg_stat_get_activity( + procpid integer, OUT datid oid, OUT pid integer, OUT usesysid oid, + OUT application_name text, OUT state text, OUT query text, + OUT waiting boolean, OUT xact_start timestamp with time zone, + OUT query_start timestamp with time zone, + OUT backend_start timestamp with time zone, + OUT state_change timestamp with time zone, + OUT client_addr inet, + OUT client_hostname text, OUT client_port integer) +RETURNS SETOF record AS +$BODY$ +BEGIN + RETURN QUERY + SELECT * + FROM pg_catalog.pg_stat_get_activity($1) s + WHERE s.datid = (SELECT oid + FROM pg_database + WHERE datname = current_database()); +END +$BODY$ +LANGUAGE 'plpgsql' SECURITY DEFINER; + +GRANT EXECUTE on FUNCTION storm_catalog.pg_stat_get_activity(integer) TO PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_stat_get_activity(integer) FROM PUBLIC; + +CREATE VIEW storm_catalog.pg_stat_activity AS + SELECT * + FROM storm_catalog.pg_stat_get_activity(NULL); + +GRANT SELECT on storm_catalog.pg_stat_activity TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_stat_activity FROM public; + +CREATE FUNCTION storm_catalog.pg_lock_status( + OUT locktype text, OUT database oid, OUT relation oid, + OUT page integer, OUT tuple smallint, OUT virtualxid text, + OUT transactionid xid, OUT classid oid, OUT objid oid, + OUT objsubid smallint, OUT virtualtransaction text, + OUT pid integer, OUT mode text, OUT granted boolean, + OUT fastpath boolean) +RETURNS SETOF record AS +$BODY$ +BEGIN + RETURN QUERY + SELECT * + FROM pg_catalog.pg_lock_status() l + WHERE l.database = (SELECT oid + FROM pg_database + WHERE datname = 
current_database()); +END +$BODY$ +LANGUAGE 'plpgsql' SECURITY DEFINER; + +GRANT EXECUTE on FUNCTION storm_catalog.pg_lock_status() TO PUBLIC; + +REVOKE ALL on FUNCTION pg_catalog.pg_lock_status() FROM PUBLIC; + +CREATE VIEW storm_catalog.pg_locks AS + SELECT * + FROM storm_catalog.pg_lock_status(); + +GRANT SELECT on storm_catalog.pg_locks TO PUBLIC; + +REVOKE ALL on pg_catalog.pg_locks FROM public; diff --git a/src/backend/commands/aggregatecmds.c b/src/backend/commands/aggregatecmds.c index ddf029e857..9b7b05bfe4 100644 --- a/src/backend/commands/aggregatecmds.c +++ b/src/backend/commands/aggregatecmds.c @@ -4,6 +4,11 @@ * * Routines for aggregate-manipulation commands * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -56,6 +61,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) List *sortoperatorName = NIL; TypeName *baseType = NULL; TypeName *transType = NULL; +#ifdef XCP + TypeName *collectType = NULL; +#endif char *initval = NULL; #ifdef PGXC List *collectfuncName = NIL; @@ -64,6 +72,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) Oid *aggArgTypes; int numArgs; Oid transTypeId; +#ifdef XCP + Oid collectTypeId; +#endif ListCell *pl; /* Convert list of names to a name and namespace */ @@ -97,6 +108,10 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) transType = defGetTypeName(defel); else if (pg_strcasecmp(defel->defname, "stype1") == 0) transType = defGetTypeName(defel); +#ifdef XCP + else if (pg_strcasecmp(defel->defname, "ctype") == 0) + collectType = defGetTypeName(defel); +#endif else if 
(pg_strcasecmp(defel->defname, "initcond") == 0) initval = defGetString(defel); else if (pg_strcasecmp(defel->defname, "initcond1") == 0) @@ -126,6 +141,17 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("aggregate sfunc must be specified"))); +#ifdef XCP + if (collectfuncName && collectType == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("if aggregate cfunc is defined aggregate ctype must be specified"))); + if (collectType && collectfuncName == NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("if aggregate ctype is defined aggregate cfunc must be specified"))); +#endif + /* * look up the aggregate's input datatype(s). */ @@ -202,6 +228,31 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) format_type_be(transTypeId)))); } +#ifdef XCP + /* + * look up the aggregate's collecttype. + * + * to the collecttype applied all the limitations as to the transtype. 
+ */ + if (collectType) + { + collectTypeId = typenameTypeId(NULL, collectType); + if (get_typtype(collectTypeId) == TYPTYPE_PSEUDO && + !IsPolymorphicType(collectTypeId)) + { + if (collectTypeId == INTERNALOID && superuser()) + /* okay */ ; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate collection data type cannot be %s", + format_type_be(collectTypeId)))); + } + } + else + collectTypeId = InvalidOid; +#endif + /* * Most of the argument-checking is done inside of AggregateCreate */ @@ -216,6 +267,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) finalfuncName, /* final function name */ sortoperatorName, /* sort operator name */ transTypeId, /* transition data type */ +#ifdef XCP + collectTypeId, /* collection data type */ +#endif #ifdef PGXC initval, /* initial condition */ initcollect); /* initial condition for collection function */ diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 9612a276f3..0a88c4ea4d 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -3,6 +3,11 @@ * analyze.c * the Postgres statistics generator * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -53,6 +58,14 @@ #include "utils/timestamp.h" #include "utils/tqual.h" +#ifdef XCP +#include "catalog/pg_operator.h" +#include "nodes/makefuncs.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#include "utils/snapmgr.h" +#endif /* Data structure for Algorithm S from Knuth 3.4.2 */ typedef struct @@ -108,6 +121,10 @@ static void update_attstats(Oid relid, bool inh, static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +#ifdef XCP +static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats); +#endif /* * analyze_rel() -- analyze one relation @@ -404,6 +421,31 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, attr_cnt = tcnt; } +#ifdef XCP + if (IS_PGXC_COORDINATOR && onerel->rd_locator_info) + { + /* + * Fetch attribute statistics from remote nodes. + */ + analyze_rel_coordinator(onerel, inh, attr_cnt, vacattrstats); + /* + * If it is a VACUUM or doing inherited relation precise values for + * relpages and reltuples are set in other place. Otherwise request + * doing it now. + */ + if (!inh && !(vacstmt->options & VACOPT_VACUUM)) + vacuum_rel_coordinator(onerel); + /* + * Skip acquiring local stats. Coordinator does not store data of + * distributed tables. + */ + nindexes = 0; + hasindex = false; + Irel = NULL; + goto cleanup; + } +#endif + /* * Open all indexes of the relation, and see if there are any analyzable * columns in the indexes. We do not analyze index columns if there was @@ -604,6 +646,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, } } +#ifdef XCP + /* + * Coordinator skips getting local stats of distributed table up to here + */ +cleanup: +#endif /* * Report ANALYZE to the stats collector, too. 
However, if doing * inherited stats we shouldn't report, because the stats collector only @@ -2790,3 +2838,423 @@ compare_mcvs(const void *a, const void *b) return da - db; } + + +#ifdef XCP +static void +analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats) +{ + char *nspname; + char *relname; + /* Fields to run query to read statistics from data nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + /* Number of data nodes from which attribute statistics are received. */ + int *numnodes; + + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + elog(LOG, "Getting detailed statistics for %s.%s", nspname, relname); + + /* Make up query string */ + initStringInfo(&query); + /* Generic statistic fields */ + appendStringInfoString(&query, "SELECT s.staattnum, " +// assume the number of tuples approximately the same on all nodes +// to build more precise statistics get this number +// "c.reltuples, " + "s.stanullfrac, " + "s.stawidth, " + "s.stadistinct"); + /* Detailed statistic slots */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, ", s.stakind%d" + ", o%d.oprname" + ", no%d.nspname" + ", t%dl.typname" + ", nt%dl.nspname" + ", t%dr.typname" + ", nt%dr.nspname" + ", s.stanumbers%d" + ", s.stavalues%d", + i, i, i, i, i, i, i, i, i); + + /* Common part of FROM clause */ + appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c " + " ON s.starelid = c.oid " + "JOIN pg_namespace nc " + " ON c.relnamespace = nc.oid "); + /* Info about involved operations */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, "LEFT JOIN (pg_operator o%d " + " JOIN pg_namespace no%d " + " ON o%d.oprnamespace = no%d.oid " + " JOIN pg_type t%dl " + " ON o%d.oprleft = t%dl.oid " + " JOIN pg_namespace 
nt%dl " + " ON t%dl.typnamespace = nt%dl.oid " + " JOIN pg_type t%dr " + " ON o%d.oprright = t%dr.oid " + " JOIN pg_namespace nt%dr " + " ON t%dr.typnamespace = nt%dr.oid) " + " ON s.staop%d = o%d.oid ", + i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i); + appendStringInfo(&query, "WHERE nc.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = NULL; + step->sql_statement = query.data; + step->force_autocommit = true; + step->exec_type = EXEC_ON_DATANODES; + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "staattnum")); +// step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, +// make_relation_tle(RelationRelationId, +// "pg_class", +// "reltuples")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stanullfrac")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stawidth")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stadistinct")); + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + /* 16 characters would be enough */ + char colname[16]; + + sprintf(colname, "stakind%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(OperatorRelationId, + "pg_operator", + "oprname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + 
make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + + sprintf(colname, "stanumbers%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + sprintf(colname, "stavalues%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + } + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + estate->es_snapshot = GetActiveSnapshot(); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /* get ready to combine results */ + numnodes = (int *) palloc(attr_cnt * sizeof(int)); + for (i = 0; i < attr_cnt; i++) + numnodes[i] = 0; + + result = ExecRemoteQuery(node); + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + int colnum = 1; + int2 attnum; +// float4 reltuples; + float4 nullfrac; + int4 width; + float4 distinct; + VacAttrStats *stats = NULL; + + + /* Process statistics from the data node */ + value = slot_getattr(result, colnum++, &isnull); /* staattnum */ + attnum = DatumGetInt16(value); + for (i = 0; i < attr_cnt; i++) + if (vacattrstats[i]->attr->attnum == attnum) + { + stats = vacattrstats[i]; + stats->stats_valid = true; + numnodes[i]++; + break; + } + +// value = slot_getattr(result, colnum++, &isnull); /* reltuples */ +// reltuples = DatumGetFloat4(value); + + if (stats) + { + value = slot_getattr(result, colnum++, 
&isnull); /* stanullfrac */ + nullfrac = DatumGetFloat4(value); + stats->stanullfrac += nullfrac; + + value = slot_getattr(result, colnum++, &isnull); /* stawidth */ + width = DatumGetInt32(value); + stats->stawidth += width; + + value = slot_getattr(result, colnum++, &isnull); /* stadistinct */ + distinct = DatumGetFloat4(value); + stats->stadistinct += distinct; + + /* Detailed statistics */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + int2 kind; + float4 *numbers; + Datum *values; + int nnumbers, nvalues; + int k; + + value = slot_getattr(result, colnum++, &isnull); /* kind */ + kind = DatumGetInt16(value); + + if (kind == 0) + { + /* + * Empty slot - skip next 8 fields: 6 fields of the + * operation identifier and two data fields (numbers and + * values) + */ + colnum += 8; + continue; + } + else + { + Oid oprid; + + /* Get operator */ + value = slot_getattr(result, colnum++, &isnull); /* oprname */ + if (isnull) + { + /* + * Operator is not specified for that kind, skip remaining + * fields to lookup the operator + */ + oprid = InvalidOid; + colnum += 5; /* skip operation nsp and types */ + } + else + { + char *oprname; + char *oprnspname; + Oid ltypid, rtypid; + char *ltypname, + *rtypname; + char *ltypnspname, + *rtypnspname; + oprname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* oprnspname */ + oprnspname = DatumGetCString(value); + /* Get left operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + ltypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + ltypnspname = DatumGetCString(value); + ltypid = get_typname_typid(ltypname, + get_namespaceid(ltypnspname)); + /* Get right operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + rtypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + rtypnspname = DatumGetCString(value); + rtypid = 
get_typname_typid(rtypname, + get_namespaceid(rtypnspname)); + /* lookup operator */ + oprid = get_operid(oprname, ltypid, rtypid, + get_namespaceid(oprnspname)); + } + /* + * Look up a statistics slot. If there is an entry of the + * same kind already, leave it, assuming the statistics + * is approximately the same on all nodes, so values from + * one node are representing entire relation well. + * If empty slot is found store values here. If no more + * slots skip remaining values. + */ + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + if (stats->stakind[k] == 0 || + (stats->stakind[k] == kind && stats->staop[k] == oprid)) + break; + } + + if (k >= STATISTIC_NUM_SLOTS) + { + /* No empty slots */ + break; + } + + /* + * If it is an existing slot which has numbers or values + * continue to the next set. If slot exists but without + * numbers and values, try to acquire them now + */ + if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 || + stats->numvalues[k] > 0)) + { + colnum += 2; /* skip numbers and values */ + continue; + } + + /* + * Initialize slot + */ + stats->stakind[k] = kind; + stats->staop[k] = oprid; + stats->numnumbers[k] = 0; + stats->stanumbers[k] = NULL; + stats->numvalues[k] = 0; + stats->stavalues[k] = NULL; + stats->statypid[k] = InvalidOid; + stats->statyplen[k] = -1; + stats->statypalign[k] = 'i'; + stats->statypbyval[k] = true; + } + + + /* get numbers */ + value = slot_getattr(result, colnum++, &isnull); /* numbers */ + if (!isnull) + { + ArrayType *arry = DatumGetArrayTypeP(value); + + /* + * We expect the array to be a 1-D float4 array; verify that. We don't + * need to use deconstruct_array() since the array data is just going + * to look like a C array of float4 values. 
+ */ + nnumbers = ARR_DIMS(arry)[0]; + if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || + ARR_HASNULL(arry) || + ARR_ELEMTYPE(arry) != FLOAT4OID) + elog(ERROR, "stanumbers is not a 1-D float4 array"); + numbers = (float4 *) palloc(nnumbers * sizeof(float4)); + memcpy(numbers, ARR_DATA_PTR(arry), + nnumbers * sizeof(float4)); + + /* + * Free arry if it's a detoasted copy. + */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numnumbers[k] = nnumbers; + stats->stanumbers[k] = numbers; + } + /* get values */ + value = slot_getattr(result, colnum++, &isnull); /* values */ + if (!isnull) + { + int j; + ArrayType *arry; + int16 elmlen; + bool elmbyval; + char elmalign; + arry = DatumGetArrayTypeP(value); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arry), + &elmlen, &elmbyval, &elmalign); + /* Deconstruct array into Datum elements; NULLs not expected */ + deconstruct_array(arry, + ARR_ELEMTYPE(arry), + elmlen, elmbyval, elmalign, + &values, NULL, &nvalues); + + /* + * If the element type is pass-by-reference, we now have a bunch of + * Datums that are pointers into the syscache value. Copy them to + * avoid problems if syscache decides to drop the entry. + */ + if (!elmbyval) + { + for (j = 0; j < nvalues; j++) + values[j] = datumCopy(values[j], elmbyval, elmlen); + } + + /* + * Free statarray if it's a detoasted copy. 
+ */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numvalues[k] = nvalues; + stats->stavalues[k] = values; + /* store details about values data type */ + stats->statypid[k] = ARR_ELEMTYPE(arry); + stats->statyplen[k] = elmlen; + stats->statypalign[k] = elmalign; + stats->statypbyval[k] = elmbyval; + } + } + } + + /* fetch next */ + result = ExecRemoteQuery(node); + } + ExecEndRemoteQuery(node); + + for (i = 0; i < attr_cnt; i++) + { + VacAttrStats *stats = vacattrstats[i]; + + if (numnodes[i] > 0) + { + stats->stanullfrac /= numnodes[i]; + stats->stawidth /= numnodes[i]; + stats->stadistinct /= numnodes[i]; + } + } + update_attstats(RelationGetRelid(onerel), inh, attr_cnt, vacattrstats); +} +#endif diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 0944131313..eee79ce74a 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3,6 +3,11 @@ * copy.c * Implements the COPY utility command * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -25,6 +30,10 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" +#ifdef XCP +#include "catalog/dependency.h" +#include "commands/sequence.h" +#endif #include "commands/copy.h" #include "commands/defrem.h" #include "commands/trigger.h" @@ -37,13 +46,13 @@ #include "optimizer/planner.h" #include "parser/parse_relation.h" #ifdef PGXC -#include "optimizer/pgxcship.h" #include "pgxc/pgxc.h" #include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/remotecopy.h" #include "nodes/nodes.h" #include "pgxc/poolmgr.h" +#include "pgxc/postgresql_fdw.h" #include "catalog/pgxc_node.h" #endif #include "rewrite/rewriteHandler.h" @@ -783,6 +792,9 @@ DoCopy(const CopyStmt *stmt, const char *queryString) bool pipe = (stmt->filename == NULL); Relation rel; uint64 processed; +#ifdef XCP + int oldSeqRangeVal = SequenceRangeVal; +#endif /* Disallow file COPY except to superusers. */ if (!pipe && !superuser()) @@ -813,10 +825,12 @@ DoCopy(const CopyStmt *stmt, const char *queryString) rte->requiredPerms = required_access; #ifdef PGXC +#ifndef XCP /* In case COPY is used on a temporary table, never use 2PC for implicit commits */ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) ExecSetTempObjectIncluded(); #endif +#endif tupDesc = RelationGetDescr(rel); attnums = CopyGetAttnums(tupDesc, rel, stmt->attlist); @@ -839,6 +853,26 @@ DoCopy(const CopyStmt *stmt, const char *queryString) rel = NULL; } +#ifdef XCP + /* + * The COPY might involve sequences. We want to cache a range of + * sequence values to avoid contacting the GTM repeatedly. This + * improves the COPY performance by quite a margin. We set the + * SequenceRangeVal GUC parameter to bring about this effect. + * Note that we could have checked the attribute list to ascertain + * if this GUC is really needed or not. 
However since this GUC + * only affects nextval calculations, if sequences are not present + * no harm is done.. + * + * The user might have set the GUC value himself. Honor that if so + */ + +#define MAX_CACHEVAL 1024 + if (rel && getOwnedSequences(RelationGetRelid(rel)) != NIL && + SequenceRangeVal == DEFAULT_CACHEVAL) + SequenceRangeVal = MAX_CACHEVAL; +#endif + if (is_from) { Assert(rel); @@ -850,6 +884,15 @@ DoCopy(const CopyStmt *stmt, const char *queryString) cstate = BeginCopyFrom(rel, stmt->filename, stmt->attlist, stmt->options); processed = CopyFrom(cstate); /* copy from file to database */ +#ifdef XCP + /* + * We should record insert to distributed table. + * Bulk inserts into local tables are recorded when heap tuples are + * written. + */ + if (IS_PGXC_COORDINATOR && rel->rd_locator_info) + pgstat_count_remote_insert(rel, (int) processed); +#endif EndCopyFrom(cstate); } else @@ -860,6 +903,11 @@ DoCopy(const CopyStmt *stmt, const char *queryString) EndCopyTo(cstate); } +#ifdef XCP + /* Set the SequenceRangeVal GUC to its earlier value */ + SequenceRangeVal = oldSeqRangeVal; +#endif + /* * Close the relation. 
If reading, we can release the AccessShareLock we * got; if writing, we should hold the lock until end of transaction to @@ -1418,10 +1466,15 @@ BeginCopy(bool is_from, */ if (remoteCopyState && remoteCopyState->rel_loc) { +#ifdef XCP + DataNodeCopyBegin(remoteCopyState); + if (!remoteCopyState->locator) +#else remoteCopyState->connections = DataNodeCopyBegin(remoteCopyState->query_buf.data, remoteCopyState->exec_nodes->nodeList, GetActiveSnapshot()); if (!remoteCopyState->connections) +#endif ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to initialize Datanodes for COPY"))); @@ -1711,7 +1764,13 @@ CopyTo(CopyState cstate) cstate->remoteCopyState && cstate->remoteCopyState->rel_loc) { - RemoteCopyData *remoteCopyState = cstate->remoteCopyState; + RemoteCopyData *rcstate = cstate->remoteCopyState; +#ifdef XCP + processed = DataNodeCopyOut( + (PGXCNodeHandle **) getLocatorNodeMap(rcstate->locator), + getLocatorNodeCount(rcstate->locator), + cstate->copy_dest == COPY_FILE ? 
cstate->copy_file : NULL); +#else RemoteCopyType remoteCopyType; /* Set up remote COPY to correct operation */ @@ -1732,6 +1791,7 @@ CopyTo(CopyState cstate) cstate->copy_file, NULL, remoteCopyType); +#endif } else { @@ -2193,7 +2253,28 @@ CopyFrom(CopyState cstate) */ if (IS_PGXC_COORDINATOR && cstate->remoteCopyState->rel_loc) { - Form_pg_attribute *attr = tupDesc->attrs; +#ifdef XCP + Datum value = (Datum) 0; + bool isnull = true; + RemoteCopyData *rcstate = cstate->remoteCopyState; + AttrNumber dist_col = rcstate->rel_loc->partAttrNum; + + if (AttributeNumberIsValid(dist_col)) + { + value = values[dist_col-1]; + isnull = nulls[dist_col-1]; + } + + if (DataNodeCopyIn(cstate->line_buf.data, + cstate->line_buf.len, + GET_NODES(rcstate->locator, value, isnull, NULL), + (PGXCNodeHandle**) getLocatorResults(rcstate->locator))) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Copy failed on a data node"))); + processed++; +#else + Form_pg_attribute *attr = tupDesc->attrs; Datum dist_col_value; bool dist_col_is_null; Oid dist_col_type; @@ -2225,11 +2306,11 @@ CopyFrom(CopyState cstate) (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Copy failed on a Datanode"))); processed++; +#endif } else { #endif - /* And now we can form the input tuple. */ tuple = heap_form_tuple(tupDesc, values, nulls); @@ -2321,6 +2402,25 @@ CopyFrom(CopyState cstate) resultRelInfo, myslot, bistate, nBufferedTuples, bufferedTuples); +#ifdef XCP + /* + * Now if line buffer contains some data that is an EOF marker. 
We should + * send it to all the participating datanodes + */ + if (cstate->line_buf.len > 0) + { + RemoteCopyData *rcstate = cstate->remoteCopyState; + if (DataNodeCopyIn(cstate->line_buf.data, + cstate->line_buf.len, + getLocatorNodeCount(rcstate->locator), + (PGXCNodeHandle **) getLocatorNodeMap(rcstate->locator))) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Copy failed on a data node"))); + + } +#endif + /* Done, clean up */ error_context_stack = errcontext.previous; @@ -2658,8 +2758,14 @@ BeginCopyFrom(Relation rel, tmp = htonl(tmp); appendBinaryStringInfo(&cstate->line_buf, (char *) &tmp, 4); +#ifdef XCP + if (DataNodeCopyInBinaryForAll(cstate->line_buf.data, 19, + getLocatorNodeCount(remoteCopyState->locator), + (PGXCNodeHandle **) getLocatorNodeMap(remoteCopyState->locator))) +#else if (DataNodeCopyInBinaryForAll(cstate->line_buf.data, 19, remoteCopyState->connections)) - ereport(ERROR, +#endif + ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("invalid COPY file header (COPY SEND)"))); } @@ -3105,11 +3211,16 @@ EndCopyFrom(CopyState cstate) /* For PGXC related COPY, free also relation location data */ if (IS_PGXC_COORDINATOR && remoteCopyState->rel_loc) { +#ifdef XCP + DataNodeCopyFinish(getLocatorNodeCount(remoteCopyState->locator), + (PGXCNodeHandle **) getLocatorNodeMap(remoteCopyState->locator)); +#else bool replicated = remoteCopyState->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED; DataNodeCopyFinish( remoteCopyState->connections, replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1, replicated ? 
COMBINE_TYPE_SAME : COMBINE_TYPE_SUM); +#endif FreeRemoteCopyData(remoteCopyState); } #endif diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 05240d581a..8759898686 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -91,10 +91,11 @@ static bool have_createdb_privilege(void); static void remove_dbtablespaces(Oid db_id); static bool check_db_file_conflict(Oid db_id); static int errdetail_busy_db(int notherbackends, int npreparedxacts); +#ifdef PGXC static void createdb_xact_callback(bool isCommit, void *arg); static void movedb_xact_callback(bool isCommit, void *arg); static void movedb_success_callback(Oid db_id, Oid tblspcoid); - +#endif /* * CREATE DATABASE diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index a464090002..8c11d476f0 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3,6 +3,11 @@ * explain.c * Explain query execution plans * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * @@ -779,6 +784,11 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_ForeignScan: pname = sname = "Foreign Scan"; break; +#ifdef XCP + case T_RemoteSubplan: + pname = sname = "Remote Subquery Scan"; + break; +#endif /* XCP */ case T_Material: pname = sname = "Materialize"; break; @@ -809,6 +819,21 @@ ExplainNode(PlanState *planstate, List *ancestors, strategy = "???"; break; } +#ifdef XCP + switch (((Agg *) plan)->aggstrategy) + { + case AGG_SLAVE: + operation = "Transition"; + break; + case AGG_MASTER: + operation = "Collection"; + break; + default: + operation = NULL; + break; + } +#endif + break; case T_WindowAgg: pname = sname = "WindowAgg"; @@ -902,6 +927,66 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainScanTarget((Scan *) plan, es); break; #endif +#ifdef XCP + case T_RemoteSubplan: + { + RemoteSubplan *rsubplan = (RemoteSubplan *) plan; + List *nodeNameList = NIL; + ListCell *lc; + + foreach(lc, rsubplan->nodeList) + { + char *nodename = get_pgxc_nodename( + PGXCNodeGetNodeOid(lfirst_int(lc), + PGXC_NODE_DATANODE)); + nodeNameList = lappend(nodeNameList, nodename); + } + + /* print out destination nodes */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (nodeNameList) + { + if (es->nodes) + { + bool first = true; + ListCell *lc; + foreach(lc, nodeNameList) + { + char *nodename = (char *) lfirst(lc); + if (first) + { + appendStringInfo(es->str, " on %s (%s", + rsubplan->execOnAll ? "all" : "any", + nodename); + first = false; + } + else + appendStringInfo(es->str, ",%s", nodename); + } + appendStringInfoChar(es->str, ')'); + } + else + { + appendStringInfo(es->str, " on %s", + rsubplan->execOnAll ? "all" : "any"); + } + } + else + { + appendStringInfo(es->str, " on local node"); + } + } + else + { + ExplainPropertyText("Replicated", + rsubplan->execOnAll ? 
"no" : "yes", + es); + ExplainPropertyList("Node List", nodeNameList, es); + } + } + break; +#endif /* XCP */ case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; @@ -1130,6 +1215,7 @@ ExplainNode(PlanState *planstate, List *ancestors, "Index Cond", planstate, ancestors, es); break; #ifdef PGXC +#ifndef XCP case T_ModifyTable: { /* Remote query planning on DMLs */ @@ -1139,12 +1225,48 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainRemoteQuery((RemoteQuery *) lfirst(elt), planstate, ancestors, es); } break; +#endif case T_RemoteQuery: /* Remote query */ ExplainRemoteQuery((RemoteQuery *)plan, planstate, ancestors, es); show_scan_qual(plan->qual, "Coordinator quals", planstate, ancestors, es); break; #endif +#ifdef XCP + case T_RemoteSubplan: + { + RemoteSubplan *rsubplan = (RemoteSubplan *) plan; + + /* print out destination nodes */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (list_length(rsubplan->distributionNodes) > 0) + { + char label[24]; + AttrNumber dkey = rsubplan->distributionKey; + sprintf(label, "Distribute results by %c", + rsubplan->distributionType); + if (dkey == InvalidAttrNumber) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "%s\n", label); + } + else + { + TargetEntry *tle = NULL; + if (plan->targetlist) + tle = (TargetEntry *) list_nth(plan->targetlist, + dkey-1); + if (IsA(tle, TargetEntry)) + show_expression((Node *) tle->expr, label, + planstate, ancestors, + false, es); + } + } + } + } + break; +#endif case T_BitmapHeapScan: show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig, "Recheck Cond", planstate, ancestors, es); @@ -1922,7 +2044,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) case T_RemoteQuery: /* get the object name from RTE itself */ Assert(rte->rtekind == RTE_REMOTE_DUMMY); - objectname = rte->relname; + objectname = get_rel_name(rte->relid); objecttag = "RemoteQuery name"; break; default: diff --git a/src/backend/commands/indexcmds.c 
b/src/backend/commands/indexcmds.c index 0628e20422..261c9705c6 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -3,6 +3,11 @@ * indexcmds.c * POSTGRES define and remove index code. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -38,7 +43,6 @@ #include "parser/parse_func.h" #include "parser/parse_oper.h" #ifdef PGXC -#include "optimizer/pgxcship.h" #include "parser/parse_utilcmd.h" #include "pgxc/pgxc.h" #endif @@ -545,6 +549,56 @@ DefineIndex(RangeVar *heapRelation, (void) index_reloptions(amoptions, reloptions, true); +#ifdef PGXC + /* Make sure we can locally enforce the index */ + if (IS_PGXC_COORDINATOR && (primary || unique)) + { + ListCell *elem; + bool isSafe = false; + + foreach(elem, attributeList) + { + IndexElem *key = (IndexElem *) lfirst(elem); + +#ifdef XCP + if (rel->rd_locator_info == NULL) + { + isSafe = true; + break; + } +#endif + + if (CheckLocalIndexColumn(rel->rd_locator_info->locatorType, + rel->rd_locator_info->partAttrName, key->name)) + { + isSafe = true; + break; + } + } + if (!isSafe) +#ifdef XCP + { + if (loose_constraints) + { + ereport(WARNING, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash/modulo distribution column."))); + /* create index still, just that it won't be unique */ + unique = false; + isconstraint = false; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash/modulo distribution 
column."))); + } +#else + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash/modulo distribution column."))); +#endif + } +#endif /* * Prepare arguments for index_create, primarily an IndexInfo structure. * Note that ii_Predicate must be in implicit-AND format. @@ -575,37 +629,6 @@ DefineIndex(RangeVar *heapRelation, accessMethodName, accessMethodId, amcanorder, isconstraint); -#ifdef PGXC - /* Check if index is safely shippable */ - if (IS_PGXC_COORDINATOR) - { - List *indexAttrs = NIL; - - /* Prepare call for shippability evaluation */ - for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - { - /* - * Expression attributes are set at 0, and do not make sense - * when comparing them to distribution columns, so bypass. - */ - if (indexInfo->ii_KeyAttrNumbers[i] > 0) - indexAttrs = lappend_int(indexAttrs, indexInfo->ii_KeyAttrNumbers[i]); - } - - /* Finalize check */ - if (!pgxc_check_index_shippability(GetRelationLocInfo(relationId), - primary, - unique, - exclusionOpNames != NULL, - indexAttrs, - indexInfo->ii_Expressions)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Cannot create index whose evaluation cannot be " - "enforced to remote nodes"))); -} -#endif - /* * Extra checks when creating a PRIMARY KEY index. */ diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 20476c3bf1..ad3a96b541 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -9,6 +9,11 @@ * storage management for portals (but doesn't run any queries in them). * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -269,6 +274,98 @@ PortalCleanup(Portal portal) queryDesc = PortalGetQueryDesc(portal); if (queryDesc) { +#ifdef XCP + if (portal->strategy == PORTAL_DISTRIBUTED) + { + /* If portal is producing it has an executor which should be + * shut down */ + if (queryDesc->myindex == -1) + { + if (portal->status == PORTAL_FAILED) + { + /* + * Failed portal is not producing, we may remove it from the + * producers list. + */ + removeProducingPortal(portal); + /* If cleanup fails below prevent double cleanup */ + portal->queryDesc = NULL; + /* + * Inform consumers about failed producer if they are + * still waiting + */ + if (queryDesc->squeue) + SharedQueueReset(queryDesc->squeue, -1); + } + /* executor may be finished already, if so estate will be null */ + if (queryDesc->estate) + { + ResourceOwner saveResourceOwner; + + /* We must make the portal's resource owner current to + * release resources properly */ + saveResourceOwner = CurrentResourceOwner; + PG_TRY(); + { + CurrentResourceOwner = portal->resowner; + /* Finish executor if it is not yet finished */ + if (!queryDesc->estate->es_finished) + ExecutorFinish(queryDesc); + /* Destroy executor if not yet destroyed */ + if (queryDesc->estate) + ExecutorEnd(queryDesc); + if (portal->status == PORTAL_FAILED) + { + /* + * If portal if failed we can allow to be blocked + * here while UnBind is waiting for finishing + * consumers. 
+ */ + if (queryDesc->squeue) + SharedQueueUnBind(queryDesc->squeue); + FreeQueryDesc(queryDesc); + } + } + PG_CATCH(); + { + /* Ensure CurrentResourceOwner is restored on error */ + CurrentResourceOwner = saveResourceOwner; + PG_RE_THROW(); + } + PG_END_TRY(); + CurrentResourceOwner = saveResourceOwner; + } + } + else + { + /* Cleaning up consumer */ + ResourceOwner saveResourceOwner; + + /* We must make the portal's resource owner current */ + saveResourceOwner = CurrentResourceOwner; + PG_TRY(); + { + CurrentResourceOwner = portal->resowner; + /* Prevent double cleanup in case of error below */ + portal->queryDesc = NULL; + /* Reset the squeue if exists */ + if (queryDesc->squeue) + SharedQueueReset(queryDesc->squeue, queryDesc->myindex); + FreeQueryDesc(queryDesc); + } + PG_CATCH(); + { + /* Ensure CurrentResourceOwner is restored on error */ + CurrentResourceOwner = saveResourceOwner; + PG_RE_THROW(); + } + PG_END_TRY(); + CurrentResourceOwner = saveResourceOwner; + } + } + else + { +#endif /* * Reset the queryDesc before anything else. This prevents us from * trying to shut down the executor twice, in case of an error below. 
@@ -299,6 +396,9 @@ PortalCleanup(Portal portal) PG_END_TRY(); CurrentResourceOwner = saveResourceOwner; } +#ifdef XCP + } +#endif } } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 41b00ba1f0..219608b571 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -503,7 +503,7 @@ SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params, char name[NAMEDATALEN]; /* Nothing to do if parameters are already set for this query */ - if (remotequery->rq_num_params != 0) + if (remotequery->remote_num_params != 0) return 0; if (stmt_name) @@ -546,8 +546,8 @@ SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Passing parameters in PREPARE statement is not supported"))); - remotequery->rq_num_params = num_params; - remotequery->rq_param_types = param_types; + remotequery->remote_num_params = num_params; + remotequery->remote_param_types = param_types; } else if (IsA(plan, ModifyTable)) { diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 6cc7cee3cf..ea00e171c1 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -3,6 +3,11 @@ * schemacmds.c * schema creation/manipulation commands * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -32,7 +37,7 @@ #ifdef PGXC #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #endif static void AlterSchemaOwner_internal(HeapTuple tup, Relation rel, Oid newOwnerId); @@ -132,9 +137,14 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString) * if not done already. */ if (!sentToRemote) +#ifdef XCP + parsetree_list = AddRemoteQueryNode(parsetree_list, queryString, + EXEC_ON_ALL_NODES); +#else parsetree_list = AddRemoteQueryNode(parsetree_list, queryString, EXEC_ON_ALL_NODES, false); #endif +#endif /* * Execute each command contained in the CREATE SCHEMA. Since the grammar diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 7e66ac99c7..3835dd92a3 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -3,6 +3,11 @@ * sequence.c * PostgreSQL sequences support code. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -41,6 +46,9 @@ /* PGXC_COORD */ #include "access/gtm.h" #include "utils/memutils.h" +#ifdef XCP +#include "utils/timestamp.h" +#endif #endif /* @@ -55,6 +63,12 @@ */ #define SEQ_MAGIC 0x1717 +/* Configuration options */ +#ifdef XCP + +int SequenceRangeVal = 1; +#endif + typedef struct sequence_magic { uint32 magic; @@ -82,6 +96,10 @@ typedef struct SeqTableData /* if last != cached, we have not used up all the cached values */ int64 increment; /* copy of sequence's increment field */ /* note that increment is zero until we first do read_info() */ +#ifdef XCP + TimestampTz last_call_time; /* the time when the last call as made */ + int64 range_multiplier; /* multiply this value with 2 next time */ +#endif } SeqTableData; typedef SeqTableData *SeqTable; @@ -125,7 +143,7 @@ static void init_params(List *options, bool isInit, Form_pg_sequence new, List **owned_by, bool *is_restart); #else static void init_params(List *options, bool isInit, - Form_pg_sequence new, List **owned_by); + Form_pg_sequence new, List **owned_by); #endif static void do_setval(Oid relid, int64 next, bool iscalled); static void process_owned_by(Relation seqrel, List *owned_by); @@ -562,7 +580,6 @@ AlterSequence(AlterSeqStmt *stmt) /* Now okay to update the on-disk tuple */ memcpy(seq, &new, sizeof(FormData_pg_sequence)); - #ifdef PGXC increment = new.increment_by; min_value = new.min_value; @@ -731,19 +748,85 @@ nextval_internal(Oid relid) page = BufferGetPage(buf); #ifdef PGXC /* PGXC_COORD */ +#ifdef XCP + /* Allow nextval executed on datanodes */ + if (!is_temp) +#else if (IS_PGXC_COORDINATOR && !is_temp) +#endif { +#ifdef XCP + int64 range = seq->cache_value; /* how many values to ask from GTM? 
*/ + int64 rangemax; /* the max value returned from the GTM for our request */ +#endif char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); /* * Above, we still use the page as a locking mechanism to handle * concurrency */ +#ifdef XCP + /* + * If the user has set a CACHE parameter, we use that. Else we pass in + * the SequenceRangeVal value + */ + if (range == DEFAULT_CACHEVAL && SequenceRangeVal > range) + { + TimestampTz curtime = GetCurrentTimestamp(); + + if (!TimestampDifferenceExceeds(elm->last_call_time, + curtime, 1000)) + { + /* + * The previous GetNextValGTM call was made just a while back. + * Request double the range of what was requested in the + * earlier call. Honor the SequenceRangeVal boundary + * value to limit very large range requests! + */ + elm->range_multiplier *= 2; + if (elm->range_multiplier < SequenceRangeVal) + range = elm->range_multiplier; + else + elm->range_multiplier = range = SequenceRangeVal; + + elog(DEBUG1, "increase sequence range %ld", range); + } + else if (TimestampDifferenceExceeds(elm->last_call_time, + curtime, 5000)) + { + /* The previous GetNextValGTM call was pretty old */ + range = elm->range_multiplier = DEFAULT_CACHEVAL; + elog(DEBUG1, "reset sequence range %ld", range); + } + else if (TimestampDifferenceExceeds(elm->last_call_time, + curtime, 3000)) + { + /* + * The previous GetNextValGTM call was made quite some time + * ago. Try to reduce the range request to reduce the gap + */ + if (elm->range_multiplier != DEFAULT_CACHEVAL) + { + range = elm->range_multiplier = + rint(elm->range_multiplier/2); + elog(DEBUG1, "decrease sequence range %ld", range); + } + } + else + { + /* + * Current range_multiplier alllows to cache sequence values + * for 1-3 seconds of work. Keep that rate. 
+ */ + range = elm->range_multiplier; + } + elm->last_call_time = curtime; + } + + result = (int64) GetNextValGTM(seqname, range, &rangemax); +#else result = (int64) GetNextValGTM(seqname); - if (result < 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("GTM error, could not obtain sequence value"))); +#endif pfree(seqname); /* Update the on-disk data */ @@ -752,7 +835,11 @@ nextval_internal(Oid relid) /* save info in local cache */ elm->last = result; /* last returned number */ +#ifdef XCP + elm->cached = rangemax; /* last fetched range max limit */ +#else elm->cached = result; /* last fetched number */ +#endif elm->last_valid = true; last_used_seq = elm; @@ -875,11 +962,11 @@ nextval_internal(Oid relid) /* Temporary sequences can go through normal process */ if (is_temp) { -#endif /* * This part is not taken into account, * result has been received from GTM */ +#endif last = next; if (rescnt == 1) /* if it's first result - */ result = next; /* it's what to return */ @@ -896,8 +983,8 @@ nextval_internal(Oid relid) /* Temporary sequences go through normal process */ if (is_temp) { -#endif /* Result has been received from GTM */ +#endif /* save info in local cache */ elm->last = result; /* last returned number */ elm->cached = last; /* last fetched number */ @@ -978,13 +1065,47 @@ currval_oid(PG_FUNCTION_ARGS) errmsg("permission denied for sequence %s", RelationGetRelationName(seqrel)))); +#ifdef XCP + { + /* + * Always contact GTM for currval regardless of valid + * elm->last_valid value + */ + { + char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); + result = (int64) GetCurrentValGTM(seqname); + pfree(seqname); + } + } +#else if (!elm->last_valid) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("currval of sequence \"%s\" is not yet defined in this session", RelationGetRelationName(seqrel)))); +#endif + +#ifndef XCP +#ifdef PGXC + if (IS_PGXC_COORDINATOR && + seqrel->rd_backend != MyBackendId) + { + char *seqname = 
GetGlobalSeqName(seqrel, NULL, NULL); + result = (int64) GetCurrentValGTM(seqname); + if (result < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not obtain sequence value"))); + pfree(seqname); + } + else { +#endif result = elm->last; +#ifdef PGXC + } +#endif +#endif relation_close(seqrel, NoLock); PG_RETURN_INT64(result); @@ -1086,7 +1207,12 @@ do_setval(Oid relid, int64 next, bool iscalled) } #ifdef PGXC +#ifdef XCP + /* Allow to execute on datanodes */ + if (!is_temp) +#else if (IS_PGXC_COORDINATOR && !is_temp) +#endif { char *seqname = GetGlobalSeqName(seqrel, NULL, NULL); @@ -1286,6 +1412,10 @@ init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel) elm->lxid = InvalidLocalTransactionId; elm->last_valid = false; elm->last = elm->cached = elm->increment = 0; +#ifdef XCP + elm->last_call_time = 0; + elm->range_multiplier = DEFAULT_CACHEVAL; +#endif elm->next = seqtab; seqtab = elm; } @@ -1561,8 +1691,8 @@ init_params(List *options, bool isInit, snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->max_value); ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("START value (%s) cannot be greater than MAXVALUE (%s)", - bufs, bufm))); + errmsg("START value (%s) cannot be greater than MAXVALUE (%s)", + bufs, bufm))); } /* RESTART [WITH] */ @@ -1595,8 +1725,8 @@ init_params(List *options, bool isInit, snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->min_value); ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("RESTART value (%s) cannot be less than MINVALUE (%s)", - bufs, bufm))); + errmsg("RESTART value (%s) cannot be less than MINVALUE (%s)", + bufs, bufm))); } if (new->last_value > new->max_value) { @@ -1607,8 +1737,8 @@ init_params(List *options, bool isInit, snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->max_value); ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("RESTART value (%s) cannot be greater than MAXVALUE (%s)", - bufs, bufm))); + errmsg("RESTART value (%s) cannot 
be greater than MAXVALUE (%s)", + bufs, bufm))); } /* CACHE */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 25da892c7b..76f7a1858d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -3,6 +3,11 @@ * tablecmds.c * Commands for creating and altering table structures and settings * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -92,7 +97,6 @@ #include "catalog/pgxc_class.h" #include "catalog/pgxc_node.h" #include "commands/sequence.h" -#include "optimizer/pgxcship.h" #include "pgxc/execRemote.h" #include "pgxc/redistrib.h" #endif @@ -672,8 +676,25 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId) /* * Add to pgxc_class. * we need to do this after CommandCounterIncrement - */ + * Distribution info is to be added under the following conditions: + * 1. The create table command is being run on a coordinator + * 2. The create table command is being run in restore mode and + * the statement contains distribute by clause. + * While adding a new datanode to the cluster an existing dump + * that was taken from a datanode is used, and + * While adding a new coordinator to the cluster an exiting dump + * that was taken from a coordinator is used. + * The dump taken from a datanode does NOT contain any DISTRIBUTE BY + * clause. This fact is used here to make sure that when the + * DISTRIBUTE BY clause is missing in the statemnet the system + * should not try to find out the node list itself. 
+ */ +#ifdef XCP + if ((IS_PGXC_COORDINATOR && stmt->distributeby) || + (isRestoreMode && stmt->distributeby != NULL)) +#else if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION) +#endif { AddRelationDistribution(relationId, stmt->distributeby, stmt->subcluster, inheritOids, descriptor); @@ -978,13 +999,8 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * internal to the group that's being truncated. Finally all the relations * are truncated and reindexed. */ -#ifdef PGXC -void -ExecuteTruncate(TruncateStmt *stmt, const char *sql_statement) -#else void ExecuteTruncate(TruncateStmt *stmt) -#endif { List *rels = NIL; List *relids = NIL; @@ -995,6 +1011,14 @@ ExecuteTruncate(TruncateStmt *stmt) SubTransactionId mySubid; ListCell *cell; +#ifdef PGXC + if (stmt->restart_seqs) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("PGXC does not support RESTART IDENTITY yet"), + errdetail("The feature is not supported currently"))); +#endif + /* * Open, exclusive-lock, and check all the explicitly-specified relations */ @@ -1246,42 +1270,6 @@ ExecuteTruncate(TruncateStmt *stmt) resultRelInfo++; } -#ifdef PGXC - /* - * In Postgres-XC, TRUNCATE needs to be launched to remote nodes before the - * AFTER triggers are launched. This insures that the triggers are being fired - * by correct events. 
- */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - { - bool is_temp = false; - RemoteQuery *step = makeNode(RemoteQuery); - - foreach(cell, stmt->relations) - { - Oid relid; - RangeVar *rel = (RangeVar *) lfirst(cell); - - relid = RangeVarGetRelid(rel, NoLock, false); - if (IsTempTable(relid)) - { - is_temp = true; - break; - } - } - - step->combine_type = COMBINE_TYPE_SAME; - step->exec_nodes = NULL; - step->sql_statement = pstrdup(sql_statement); - step->force_autocommit = false; - step->exec_type = EXEC_ON_DATANODES; - step->is_temp = is_temp; - ExecRemoteUtility(step); - pfree(step->sql_statement); - pfree(step); - } -#endif - /* Handle queued AFTER triggers */ AfterTriggerEndQuery(estate); @@ -6363,29 +6351,6 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, ffeqoperators[i] = ffeqop; } -#ifdef PGXC - /* Check the shippability of this foreign key */ - if (IS_PGXC_COORDINATOR) - { - List *childRefs = NIL, *parentRefs = NIL; - - /* Prepare call for shippability check */ - for (i = 0; i < numfks; i++) - childRefs = lappend_int(childRefs, fkattnum[i]); - for (i = 0; i < numpks; i++) - parentRefs = lappend_int(parentRefs, pkattnum[i]); - - /* Now check shippability for this foreign key */ - if (!pgxc_check_fk_shippability(GetRelationLocInfo(RelationGetRelid(pkrel)), - GetRelationLocInfo(RelationGetRelid(rel)), - parentRefs, - childRefs)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Cannot create foreign key whose evaluation cannot be enforced to remote nodes"))); - } -#endif - /* * Record the FK constraint in pg_constraint. */ @@ -6470,6 +6435,15 @@ ATExecValidateConstraint(Relation rel, char *constrName, bool recurse, Form_pg_constraint con = NULL; bool found = false; +#ifdef XCP + /* + * Do not validate distributed relations on Coordinator, let Datanode do + * that when executing the ALTER TABLE statement. 
+ */ + if (IS_PGXC_COORDINATOR && rel->rd_locator_info) + return; +#endif + conrel = heap_open(ConstraintRelationId, RowExclusiveLock); /* @@ -10277,16 +10251,12 @@ ATCheckCmd(Relation rel, AlterTableCmd *cmd) switch (cmd->subtype) { case AT_DropColumn: - { - AttrNumber attnum = get_attnum(RelationGetRelid(rel), - cmd->name); - /* Distribution column cannot be dropped */ - if (IsDistribColumn(RelationGetRelid(rel), attnum)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* Distribution column cannot be dropped */ + if (IsDistColumnForRelId(RelationGetRelid(rel), cmd->name)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Distribution column cannot be dropped"))); - break; - } + break; default: break; @@ -10308,6 +10278,9 @@ BuildRedistribCommands(Oid relid, List *subCmds) Oid *new_oid_array; /* Modified list of Oids */ int new_num, i; /* Modified number of Oids */ ListCell *item; +#ifdef XCP + char node_type = PGXC_NODE_DATANODE; +#endif /* Get necessary information about relation */ rel = relation_open(redistribState->relid, NoLock); @@ -10373,49 +10346,18 @@ BuildRedistribCommands(Oid relid, List *subCmds) /* Build relation node list for new locator info */ for (i = 0; i < new_num; i++) +#ifdef XCP + newLocInfo->nodeList = lappend_int(newLocInfo->nodeList, + PGXCNodeGetNodeId(new_oid_array[i], + &node_type)); +#else newLocInfo->nodeList = lappend_int(newLocInfo->nodeList, PGXCNodeGetNodeId(new_oid_array[i], PGXC_NODE_DATANODE)); - +#endif /* Build the command tree for table redistribution */ PGXCRedistribCreateCommandList(redistribState, newLocInfo); - /* - * Using the new locator info already available, check if constraints on - * relation are compatible with the new distribution. 
- */ - foreach(item, RelationGetIndexList(rel)) - { - Oid indid = lfirst_oid(item); - Relation indexRel = index_open(indid, AccessShareLock); - List *indexColNums = NIL; - int2vector colIds = indexRel->rd_index->indkey; - - /* - * Prepare call to shippability check. Attributes set to 0 correspond - * to index expressions and are evaluated internally, so they are not - * appended in given list. - */ - for (i = 0; i < colIds.dim1; i++) - { - if (colIds.values[i] > 0) - indexColNums = lappend_int(indexColNums, colIds.values[i]); - } - - if (!pgxc_check_index_shippability(newLocInfo, - indexRel->rd_index->indisprimary, - indexRel->rd_index->indisunique, - indexRel->rd_index->indisexclusion, - indexColNums, - indexRel->rd_indexprs)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Cannot alter table to distribution incompatible " - "with existing constraints"))); - - index_close(indexRel, AccessShareLock); - } - /* Clean up */ FreeRelationLocInfo(newLocInfo); pfree(new_oid_array); @@ -10619,10 +10561,10 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt) if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && rel->rd_rel->relkind == RELKIND_SEQUENCE && - !IsTempSequence(relid)) + !IsTempSequence(RelationGetRelid(rel))) { char *seqname = GetGlobalSeqName(rel, NULL, NULL); - char *newseqname = GetGlobalSeqName(rel, NULL, stmt->newschema); + char *newseqname = GetGlobalSeqName(rel, NULL, get_namespace_name(nspOid)); /* We also need to rename it on the GTM */ if (RenameSequenceGTM(seqname, newseqname) < 0) @@ -10638,8 +10580,6 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt) } #endif - /* close rel, but keep lock until commit */ - relation_close(rel, NoLock); } /* @@ -10796,7 +10736,7 @@ AlterSeqNamespaces(Relation classRel, Relation rel, !IsTempSequence(RelationGetRelid(seqRel))) { char *seqname = GetGlobalSeqName(seqRel, NULL, NULL); - char *newseqname = GetGlobalSeqName(seqRel, NULL, newNspName); + char *newseqname = GetGlobalSeqName(seqRel, NULL, 
get_namespace_name(newNspOid)); /* We also need to rename it on the GTM */ if (RenameSequenceGTM(seqname, newseqname) < 0) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 04e2cc1acb..b540ac07cd 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -3,6 +3,11 @@ * trigger.c * PostgreSQL TRIGGERs support code. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -56,7 +61,6 @@ #include "utils/tqual.h" #ifdef PGXC #include "pgxc/pgxc.h" -#include "optimizer/pgxcship.h" #endif @@ -90,10 +94,6 @@ static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, int event, bool row_trigger, HeapTuple oldtup, HeapTuple newtup, List *recheckIndexes, Bitmapset *modifiedCols); -#ifdef PGXC -static bool pgxc_is_trigger_shippable(Trigger *trigger); -static bool pgxc_is_trigger_firable(Trigger *trigger); -#endif /* @@ -1932,12 +1932,6 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo) NULL, NULL, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigger = trigger; newtuple = ExecCallTriggerFunc(&LocTriggerData, i, @@ -1993,12 +1987,6 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo, NULL, NULL, newtuple)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = oldtuple = newtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; LocTriggerData.tg_trigger = trigger; @@ -2074,12 +2062,6 @@ 
ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo, NULL, NULL, newtuple)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = oldtuple = newtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; LocTriggerData.tg_trigger = trigger; @@ -2149,12 +2131,6 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo) NULL, NULL, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigger = trigger; newtuple = ExecCallTriggerFunc(&LocTriggerData, i, @@ -2217,12 +2193,6 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, NULL, trigtuple, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = trigtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; LocTriggerData.tg_trigger = trigger; @@ -2290,12 +2260,6 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo, NULL, trigtuple, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = trigtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; LocTriggerData.tg_trigger = trigger; @@ -2351,12 +2315,6 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo) modifiedCols, NULL, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigger = trigger; newtuple = ExecCallTriggerFunc(&LocTriggerData, i, @@ -2441,12 +2399,6 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, modifiedCols, trigtuple, newtuple)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = trigtuple; 
LocTriggerData.tg_newtuple = oldtuple = newtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; @@ -2534,12 +2486,6 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo, NULL, trigtuple, newtuple)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigtuple = trigtuple; LocTriggerData.tg_newtuple = oldtuple = newtuple; LocTriggerData.tg_trigtuplebuf = InvalidBuffer; @@ -2611,12 +2557,6 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo) NULL, NULL, NULL)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - LocTriggerData.tg_trigger = trigger; newtuple = ExecCallTriggerFunc(&LocTriggerData, i, @@ -4354,7 +4294,15 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt) /* * Not found ? */ +#ifdef XCP + /* + * Constraint exists where table exists, it's OK if constraint is + * not found on a data node. Silently ignore that. + */ + if (!found && !IS_PGXC_DATANODE) +#else if (!found) +#endif ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("constraint \"%s\" does not exist", @@ -4683,12 +4631,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, modifiedCols, oldtup, newtup)) continue; -#ifdef PGXC - /* Fire the trigger if authorized */ - if (!pgxc_is_trigger_firable(trigger)) - continue; -#endif - /* * If this is an UPDATE of a PK table or FK table that does not change * the PK or FK respectively, we can skip queuing the event: there is @@ -4766,197 +4708,3 @@ pg_trigger_depth(PG_FUNCTION_ARGS) { PG_RETURN_INT32(MyTriggerDepth); } - - -#ifdef PGXC -/* - * pgxc_check_triggers_shippability - * Check if given relation can be shipped entirely based on its potential - * triggers actions. If at least one trigger is not shippable then the given - * relation cannot be shipped completely to remote nodes for given command - * type. 
- */ -bool -pgxc_check_triggers_shippability(Oid relid, CmdType commandType) -{ - Relation rel = relation_open(relid, AccessShareLock); - bool res = true; - int i; - TriggerDesc *trigdesc; - - /* Relation has no triggers, can safely return */ - if (!rel->rd_rel->relhastriggers) - goto finish; - - /* Rebuild trigger list if necessary */ - if (rel->rd_rel->relhastriggers && rel->trigdesc == NULL) - RelationBuildTriggers(rel); - - /* Definitely no triggers for this relation */ - if (rel->trigdesc == NULL) - goto finish; - - trigdesc = rel->trigdesc; - - /* - * Check if there are any triggers related to given command - * If there are any, we need to scan the triggers to be sure - * that they are safe. - */ - switch (commandType) - { - case CMD_INSERT: - if (!trigdesc->trig_insert_before_row && - !trigdesc->trig_insert_after_row && - !trigdesc->trig_insert_instead_row && - !trigdesc->trig_insert_before_statement && - !trigdesc->trig_insert_after_statement) - goto finish; - break; - case CMD_UPDATE: - if (!trigdesc->trig_update_before_row && - !trigdesc->trig_update_after_row && - !trigdesc->trig_update_instead_row && - !trigdesc->trig_update_before_statement && - !trigdesc->trig_update_after_statement) - goto finish; - break; - case CMD_DELETE: - if (!trigdesc->trig_delete_before_row && - !trigdesc->trig_delete_after_row && - !trigdesc->trig_delete_instead_row && - !trigdesc->trig_delete_before_statement && - !trigdesc->trig_delete_after_statement) - goto finish; - break; - case CMD_UTILITY: - /* Trigger might be based on an event */ - if (!trigdesc->trig_truncate_before_statement && - !trigdesc->trig_truncate_after_statement) - goto finish; - break; - case CMD_SELECT: - default: - Assert(0); /* Shouldn't come here */ - } - - /* - * By being here, it is sure that there are triggers on this relation - * that are based on events based on the command type invocated. - * So let's scan each potential trigger and be such that it is shippable. 
- */ - for (i = 0; i < trigdesc->numtriggers; i++) - { - Trigger *trigger = &trigdesc->triggers[i]; - int16 tgtype = trigger->tgtype; - - switch (commandType) - { - case CMD_INSERT: - /* Don't mind if trigger is not involved in INSERT */ - if (!TRIGGER_FOR_INSERT(tgtype)) - continue; - break; - case CMD_UPDATE: - /* Don't mind if trigger is not involved in UPDATE */ - if (!TRIGGER_FOR_UPDATE(tgtype)) - continue; - break; - case CMD_DELETE: - /* Don't mind if trigger is not involved in UPDATE */ - if (!TRIGGER_FOR_DELETE(tgtype)) - continue; - break; - /* Trigger might be on a truncate */ - case CMD_UTILITY: - /* Don't mind if trigger is not involved in TRUNCATE */ - if (!TRIGGER_FOR_TRUNCATE(tgtype)) - continue; - break; - case CMD_SELECT: - default: - Assert(0); /* Shouldn't come here */ - continue; - } - - /* Check trigger shippability */ - res = pgxc_is_trigger_shippable(trigger); - - /* Leave if trigger is not shippable */ - if (!res) - goto finish; - } - -finish: - relation_close(rel, AccessShareLock); - return res; -} - - -/* - * pgxc_is_trigger_shippable - * Depending on the node type where this trigger is evaluated and - * its shippability, determine if the trigger can be fired or not. - */ -static bool -pgxc_is_trigger_firable(Trigger *trigger) -{ - bool is_shippable = pgxc_is_trigger_shippable(trigger); - - /* - * If trigger is based on a constraint or is internal, enforce its launch - * whatever the node type where we are for the time being. - * PGXCTODO: we need to remove this condition once constraints and triggers - * are better implemented within Postgres-XC as a constraint can be locally - * evaluated on remote nodes depending on the distribution type of the table - * on which it is defined or on its parent/child distribution types. 
- */ - if (trigger->tgisinternal) - return true; - - /* A non-shippable trigger can be fired safely on a local Coordinator */ - if (!is_shippable && IS_PGXC_COORDINATOR && !IsConnFromCoord()) - return true; - - /* A shippable trigger can be fired safely on a remote node */ - if (is_shippable && IsConnFromCoord()) - return true; - - return false; -} - - -/* - * pgxc_is_trigger_shippable - * Check if trigger is shippable to a remote node - */ -static bool -pgxc_is_trigger_shippable(Trigger *trigger) -{ - bool res = true; - - /* - * If trigger is based on a constraint or is internal, enforce its launch - * whatever the node type where we are for the time being. - * PGXCTODO: we need to remove this condition once constraints and triggers - * are better implemented within Postgres-XC as a constraint can be locally - * evaluated on remote nodes depending on the distribution type of the table - * on which it is defined or on its parent/child distribution types. - */ - if (trigger->tgisinternal) - return true; - - /* - * INSTEAD OF triggers can only be defined on views, which are defined - * only on Coordinators, so they cannot be shipped. - */ - if (TRIGGER_FOR_INSTEAD(trigger->tgtype)) - res = false; - - /* Finally check if function called is shippable */ - if (!pgxc_is_func_shippable(trigger->tgfoid)) - res = false; - - return res; -} -#endif diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f96e0700f8..e638d2898b 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -9,6 +9,11 @@ * in cluster.c. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -51,6 +56,13 @@ #ifdef PGXC #include "pgxc/pgxc.h" #endif +#ifdef XCP +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "pgxc/execRemote.h" +#include "pgxc/planner.h" +#include "utils/lsyscache.h" +#endif /* XCP */ /* * GUC parameters @@ -70,7 +82,6 @@ static void vac_truncate_clog(TransactionId frozenXID); static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound); - /* * Primary entry point for VACUUM and ANALYZE commands. * @@ -1091,6 +1102,17 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound) save_sec_context | SECURITY_RESTRICTED_OPERATION); save_nestlevel = NewGUCNestLevel(); +#ifdef XCP + /* + * If we are on coordinator and target relation is distributed, read + * the statistics from the data node instead of vacuuming local relation. 
+ */ + if (IS_PGXC_COORDINATOR && onerel->rd_locator_info) + { + vacuum_rel_coordinator(onerel); + } + else +#endif /* * Do the actual work --- either FULL or "lazy" vacuum */ @@ -1231,3 +1253,281 @@ vacuum_delay_point(void) CHECK_FOR_INTERRUPTS(); } } + +#ifdef XCP +/* + * For the data node query make up TargetEntry representing specified column + * of pg_class catalog table + */ +TargetEntry * +make_relation_tle(Oid reloid, const char *relname, const char *column) +{ + HeapTuple tuple; + Var *var; + Form_pg_attribute att_tup; + TargetEntry *tle; + + tuple = SearchSysCacheAttName(reloid, column); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" of relation \"%s\" does not exist", + column, relname))); + att_tup = (Form_pg_attribute) GETSTRUCT(tuple); + + var = makeVar(1, + att_tup->attnum, + att_tup->atttypid, + att_tup->atttypmod, + InvalidOid, + 0); + + tle = makeTargetEntry((Expr *) var, att_tup->attnum, NULL, false); + ReleaseSysCache(tuple); + return tle; +} + + +/* + * Get relation statistics from remote data nodes + * Returns number of nodes that returned correct statistics. 
+ */ +static int +get_remote_relstat(char *nspname, char *relname, bool replicated, + int32 *pages, float4 *tuples, TransactionId *frozenXid) +{ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int validpages, + validtuples, + validfrozenxids; + + /* Make up query string */ + initStringInfo(&query); + appendStringInfo(&query, "SELECT c.relpages, " + "c.reltuples, " + "c.relfrozenxid " + "FROM pg_class c JOIN pg_namespace n " + "ON c.relnamespace = n.oid " + "WHERE n.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + step = makeNode(RemoteQuery); + + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = NULL; + step->sql_statement = query.data; + step->force_autocommit = true; + step->exec_type = EXEC_ON_DATANODES; + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relpages")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "reltuples")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relfrozenxid")); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + estate->es_snapshot = GetActiveSnapshot(); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + /* get ready to combine results */ + *pages = 0; + *tuples = 0.0; + *frozenXid = InvalidTransactionId; + validpages = 0; + validtuples = 0; + validfrozenxids = 0; + result = ExecRemoteQuery(node); + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + /* Process statistics from the data node */ + value = slot_getattr(result, 1, &isnull); /* relpages */ + if (!isnull) + { + 
validpages++; + *pages += DatumGetInt32(value); + } + value = slot_getattr(result, 2, &isnull); /* reltuples */ + if (!isnull) + { + validtuples++; + *tuples += DatumGetFloat4(value); + } + value = slot_getattr(result, 3, &isnull); /* relfrozenxid */ + if (!isnull) + { + /* + * relfrozenxid on coordinator should be the lowest one from the + * datanodes. + */ + TransactionId xid = DatumGetTransactionId(value); + if (TransactionIdIsValid(xid)) + { + validfrozenxids++; + if (!TransactionIdIsValid(*frozenXid) || + TransactionIdPrecedes(xid, *frozenXid)) + { + *frozenXid = xid; + } + } + } + /* fetch next */ + result = ExecRemoteQuery(node); + } + ExecEndRemoteQuery(node); + + if (replicated) + { + /* + * Normally numbers should be the same on the nodes, but relations + * are autovacuum'ed independedly, so they may differ. + * Average is good enough approximation in this case. + */ + if (validpages > 0) + *pages /= validpages; + + if (validtuples > 0) + *tuples /= validtuples; + } + + if (validfrozenxids < validpages || validfrozenxids < validtuples) + { + /* + * If some node returned invalid value for frozenxid we can not set + * it on coordinator. There are other cases when returned value of + * frozenXid should be ignored, these cases are checked by caller. + * Basically, to be sure, there should be one value from each node, + * where the table is partitioned. + */ + *frozenXid = InvalidTransactionId; + return Max(validpages, validtuples); + } + else + { + return validfrozenxids; + } +} + + +/* + * Coordinator does not contain any data, so we never need to vacuum relations. + * This function only updates optimizer statistics based on info from the + * data nodes. 
+ */ +void +vacuum_rel_coordinator(Relation onerel) +{ + char *nspname; + char *relname; + /* fields to combine relation statistics */ + int32 num_pages; + float4 num_tuples; + TransactionId min_frozenxid; + bool hasindex; + bool replicated; + int rel_nodes; + + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + elog(LOG, "Getting relation statistics for %s.%s", nspname, relname); + + replicated = IsLocatorReplicated(RelationGetLocatorType(onerel)); + /* + * Get stats from the remote nodes. Function returns the number of nodes + * returning correct stats. + */ + rel_nodes = get_remote_relstat(nspname, relname, replicated, + &num_pages, &num_tuples, &min_frozenxid); + if (rel_nodes > 0) + { + int nindexes; + Relation *Irel; + int nodes = list_length(RelationGetLocInfo(onerel)->nodeList); + + vac_open_indexes(onerel, ShareUpdateExclusiveLock, &nindexes, &Irel); + hasindex = (nindexes > 0); + + if (hasindex) + { + int i; + + /* Fetch index stats */ + for (i = 0; i < nindexes; i++) + { + int32 idx_pages; + float4 idx_tuples; + TransactionId idx_frozenxid; + int idx_nodes; + + /* Get the index identifier */ + relname = RelationGetRelationName(Irel[i]); + nspname = get_namespace_name(RelationGetNamespace(Irel[i])); + /* Index is replicated if parent relation is replicated */ + idx_nodes = get_remote_relstat(nspname, relname, replicated, + &idx_pages, &idx_tuples, &idx_frozenxid); + if (idx_nodes > 0) + { + /* + * Do not update the frozenxid if information was not from + * all the expected nodes. 
+ */ + if (idx_nodes < nodes) + { + idx_frozenxid = InvalidTransactionId; + } + /* save changes */ + vac_update_relstats(Irel[i], + (BlockNumber) idx_pages, + (double) idx_tuples, + 0, + false, + idx_frozenxid); + } + } + } + + /* Done with indexes */ + vac_close_indexes(nindexes, Irel, NoLock); + + /* + * Do not update the frozenxid if information was not from all + * the expected nodes. + */ + if (rel_nodes < nodes) + { + min_frozenxid = InvalidTransactionId; + } + + /* save changes */ + vac_update_relstats(onerel, + (BlockNumber) num_pages, + (double) num_tuples, + visibilitymap_count(onerel), + hasindex, + min_frozenxid); + } +} +#endif diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c index 112703819e..ea2211a27b 100644 --- a/src/backend/commands/variable.c +++ b/src/backend/commands/variable.c @@ -4,6 +4,11 @@ * Routines for handling specialized SET variables. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -20,6 +25,9 @@ #include "access/xact.h" #include "catalog/pg_authid.h" +#ifdef XCP +#include "catalog/pgxc_node.h" +#endif #include "commands/variable.h" #include "miscadmin.h" #include "utils/acl.h" @@ -890,6 +898,126 @@ assign_session_authorization(const char *newval, void *extra) } +#ifdef XCP + +/* + * SET GLOBAL SESSION + */ + +typedef struct +{ + /* This is the "extra" state for GLOBAL SESSION */ + Oid coordid; + int coordpid; +} global_session_extra; + + +bool +check_global_session(char **newval, void **extra, GucSource source) +{ + HeapTuple coordTup; + Oid coordid; + char *separatorPos; + int coordpid; + global_session_extra *myextra; + + /* Do nothing for the boot_val default of NULL */ + if (*newval == NULL) + return true; + + if (strcmp(*newval, "none") == 0) + { + /* hardwired translation */ + coordid = InvalidOid; + coordpid = 0; + } + else + { + if (!IsTransactionState()) + { + /* + * Can't do catalog lookups, so fail. The result of this is that + * global_session cannot be set in postgresql.conf, which seems + * like a good thing anyway, so we don't work hard to avoid it. 
+ */ + return false; + } + + /* + * Get pointer on '_' character separating coordinator name from pid in the + * global session identifier + */ + separatorPos = strrchr(*newval, '_'); + if (separatorPos == NULL) + { + GUC_check_errmsg("malformed Global Session identifier: \"%s\"", *newval); + return false; + } + + /* + * The pid is written immediately after the separator + */ + coordpid = atoi(separatorPos + 1); + if (coordpid <= 0) + { + GUC_check_errmsg("malformed Global Session identifier: \"%s\"", *newval); + return false; + } + + + /* + * Temporary truncate the Global Session identifier to extract session name + */ + *separatorPos = '\0'; + /* Look up the nodename */ + coordTup = SearchSysCache1(PGXCNODENAME, PointerGetDatum(*newval)); + if (!HeapTupleIsValid(coordTup)) + { + *separatorPos = '_'; + GUC_check_errmsg("node \"%s\" does not exist", *newval); + return false; + } + + if (((Form_pgxc_node) GETSTRUCT(coordTup))->node_type != PGXC_NODE_COORDINATOR) + { + ReleaseSysCache(coordTup); + *separatorPos = '_'; + GUC_check_errmsg("node \"%s\" is not a coordinator", *newval); + return false; + } + + coordid = HeapTupleGetOid(coordTup); + + *separatorPos = '_'; + ReleaseSysCache(coordTup); + } + + /* Set up "extra" struct for assign_session_authorization to use */ + myextra = (global_session_extra *) malloc(sizeof(global_session_extra)); + if (!myextra) + return false; + myextra->coordid = coordid; + myextra->coordpid = coordpid; + *extra = (void *) myextra; + + return true; +} + + +void +assign_global_session(const char *newval, void *extra) +{ + global_session_extra *myextra = (global_session_extra *) extra; + + /* Do nothing for the boot_val default of NULL */ + if (!myextra) + return; + + SetGlobalSession(myextra->coordid, myextra->coordpid); +} +#endif + + /* * SET ROLE * diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index 6ab0ce8345..cdd7c64870 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -3,6 +3,11 
@@ * view.c * use rewrite rules to construct views * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -511,6 +516,14 @@ DefineView(ViewStmt *stmt, const char *queryString) if (view->relpersistence == RELPERSISTENCE_PERMANENT && isViewOnTempTable(viewParse)) { + view = copyObject(view); /* don't corrupt original command */ +#ifdef XCP + /* + * Change original command as well - we do not want to create that view + * on other coordinators where temp table does not exist + */ + stmt->view->relpersistence = RELPERSISTENCE_TEMP; +#endif view->relpersistence = RELPERSISTENCE_TEMP; ereport(NOTICE, (errmsg("view \"%s\" will be a temporary view", @@ -518,10 +531,12 @@ DefineView(ViewStmt *stmt, const char *queryString) } #ifdef PGXC +#ifndef XCP /* In case view is temporary, be sure not to use 2PC on such relations */ if (view->relpersistence == RELPERSISTENCE_TEMP) ExecSetTempObjectIncluded(); #endif +#endif /* * Create the view relation diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index 6081b56c08..ffe97a90b2 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -24,6 +24,6 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \ nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \ nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \ - nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o + nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o spi.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/executor/execAmi.c 
b/src/backend/executor/execAmi.c index 462d137e29..4008f39582 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -3,6 +3,11 @@ * execAmi.c * miscellaneous executor access method routines * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -200,10 +205,16 @@ ExecReScan(PlanState *node) break; #ifdef PGXC +#ifdef XCP + case T_RemoteSubplanState: + ExecReScanRemoteSubplan((RemoteSubplanState *) node); + break; +#else case T_RemoteQueryState: ExecRemoteQueryReScan((RemoteQueryState *) node, node->ps_ExprContext); break; #endif +#endif case T_NestLoopState: ExecReScanNestLoop((NestLoopState *) node); break; diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c index 9eaa4710d5..e1a140b7dd 100644 --- a/src/backend/executor/execCurrent.c +++ b/src/backend/executor/execCurrent.c @@ -3,6 +3,11 @@ * execCurrent.c * executor support for WHERE CURRENT OF cursor * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -268,6 +273,7 @@ search_plan_tree(PlanState *node, Oid table_oid) switch (nodeTag(node)) { #ifdef PGXC +#ifndef XCP case T_RemoteQueryState: { RemoteQueryState *rqs = (RemoteQueryState *) node; @@ -275,6 +281,7 @@ search_plan_tree(PlanState *node, Oid table_oid) return sstate; } #endif +#endif /* * scan nodes can all be treated alike */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 3a95dd109b..1283b39e89 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,6 +26,11 @@ * before ExecutorEnd. This can be omitted only in case of EXPLAIN, * which should also omit ExecutorRun. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -59,6 +64,11 @@ #include "pgxc/pgxc.h" #include "commands/copy.h" #endif +#ifdef XCP +#include "access/gtm.h" +#include "pgxc/execRemote.h" +#include "pgxc/poolmgr.h" +#endif /* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */ ExecutorStart_hook_type ExecutorStart_hook = NULL; @@ -153,8 +163,39 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) estate->es_param_list_info = queryDesc->params; if (queryDesc->plannedstmt->nParamExec > 0) +#ifdef XCP + { + estate->es_param_exec_vals = (ParamExecData *) + palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData)); + if (queryDesc->plannedstmt->nParamRemote > 0) + { + ParamListInfo extparams = estate->es_param_list_info; + int i = queryDesc->plannedstmt->nParamRemote; + while (--i >= 0 && + queryDesc->plannedstmt->remoteparams[i].paramkind == PARAM_EXEC) + { + int paramno = queryDesc->plannedstmt->remoteparams[i].paramid; + ParamExecData *prmdata; + + Assert(paramno >= 0 && + paramno < queryDesc->plannedstmt->nParamExec); + prmdata = &(estate->es_param_exec_vals[paramno]); + prmdata->value = extparams->params[i].value; + prmdata->isnull = extparams->params[i].isnull; + prmdata->ptype = extparams->params[i].ptype; + } + /* + * Truncate exec parameters from the list of received parameters + * to avoid sending down duplicates if there are multiple levels + * of RemoteSubplan statements + */ + extparams->numParams = i + 1; + } + } +#else estate->es_param_exec_vals = (ParamExecData *) palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData)); +#endif /* * If non-read-only query, set the command ID to mark output tuples with @@ -766,8 +807,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) /* es_result_relation_info is NULL except when within ModifyTable */ estate->es_result_relation_info = NULL; #ifdef PGXC +#ifndef XCP 
estate->es_result_remoterel = NULL; #endif +#endif } else { @@ -778,7 +821,9 @@ InitPlan(QueryDesc *queryDesc, int eflags) estate->es_num_result_relations = 0; estate->es_result_relation_info = NULL; #ifdef PGXC - estate->es_result_remoterel = NULL; +#ifndef XCP +estate->es_result_remoterel = NULL; +#endif #endif } @@ -869,6 +914,16 @@ InitPlan(QueryDesc *queryDesc, int eflags) sp_eflags = eflags & EXEC_FLAG_EXPLAIN_ONLY; if (bms_is_member(i, plannedstmt->rewindPlanIDs)) sp_eflags |= EXEC_FLAG_REWIND; +#ifdef XCP + /* + * Distributed executor may never execute that plan because referencing + * subplan is executed on remote node, so we may save some resources. + * At the moment only RemoteSubplan is aware of this flag, it is + * skipping sending down subplan. + * ExecInitSubPlan takes care about finishing initialization. + */ + sp_eflags |= EXEC_FLAG_SUBPLAN; +#endif subplanstate = ExecInitNode(subplan, estate, sp_eflags); @@ -894,7 +949,15 @@ InitPlan(QueryDesc *queryDesc, int eflags) * Initialize the junk filter if needed. SELECT queries need a filter if * there are any junk attrs in the top-level tlist. 
*/ +#ifdef XCP + /* + * We need to keep junk attrs in intermediate results, they may be needed + * in upper level plans on the receiving side + */ + if (!IS_PGXC_DATANODE && operation == CMD_SELECT) +#else if (operation == CMD_SELECT) +#endif { bool junk_filter_needed = false; ListCell *tlist; @@ -2244,9 +2307,11 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) estate->es_result_relation_info = parentestate->es_result_relation_info; #ifdef PGXC +#ifndef XCP /* XXX Check if this is OK */ estate->es_result_remoterel = parentestate->es_result_remoterel; #endif +#endif /* es_trig_target_relations must NOT be copied */ estate->es_rowMarks = parentestate->es_rowMarks; diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index b7dc1f311d..7685152285 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -7,6 +7,11 @@ * ExecProcNode, or ExecEndNode on its subnodes and do the appropriate * processing. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -322,6 +327,12 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; #endif +#ifdef XCP + case T_RemoteSubplan: + result = (PlanState *) ExecInitRemoteSubplan((RemoteSubplan *) node, + estate, eflags); + break; +#endif /* XCP */ default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); @@ -334,6 +345,15 @@ ExecInitNode(Plan *node, EState *estate, int eflags) * a separate list for us. */ subps = NIL; +#ifdef XCP + /* + * If plan being initialized during we should skip doing initPlan here. 
+ * In case the plan is actually referenced on this step of the distributed + * plan it will be done in ExecFinishInitProcNode + */ + if (!(eflags & EXEC_FLAG_SUBPLAN)) + { +#endif foreach(l, node->initPlan) { SubPlan *subplan = (SubPlan *) lfirst(l); @@ -343,6 +363,9 @@ ExecInitNode(Plan *node, EState *estate, int eflags) sstate = ExecInitSubPlan(subplan, result); subps = lappend(subps, sstate); } +#ifdef XCP + } +#endif result->initPlan = subps; /* Set up instrumentation for this node if requested */ @@ -353,6 +376,67 @@ ExecInitNode(Plan *node, EState *estate, int eflags) } +#ifdef XCP +/* + * The subplan is referenced on local node, finish initialization + */ +void +ExecFinishInitProcNode(PlanState *node) +{ + List *subps; + ListCell *l; + + /* Exit if we reached leaf of the tree */ + if (node == NULL) + return; + + /* Special cases */ + switch (nodeTag(node)) + { + case T_RemoteSubplanState: + ExecFinishInitRemoteSubplan((RemoteSubplanState *) node); + break; + + case T_AppendState: + { + AppendState *append = (RemoteSubplanState *) node; + int i; + + for (i = 0; i < append->as_nplans; i++) + ExecFinishInitProcNode(append->appendplans[i]); + + break; + } + + case T_SubqueryScanState: + ExecFinishInitProcNode(((SubqueryScanState *) node)->subplan); + break; + + default: + break; + } + + /* + * Common case, recurse the tree + */ + ExecFinishInitProcNode(node->lefttree); + ExecFinishInitProcNode(node->righttree); + + subps = NIL; + foreach(l, node->plan->initPlan) + { + SubPlan *subplan = (SubPlan *) lfirst(l); + SubPlanState *sstate; + + Assert(IsA(subplan, SubPlan)); + sstate = ExecInitSubPlan(subplan, node); + subps = lappend(subps, sstate); + } + node->initPlan = subps; +} +#endif + + /* ---------------------------------------------------------------- * ExecProcNode * @@ -513,6 +597,11 @@ ExecProcNode(PlanState *node) result = ExecRemoteQuery((RemoteQueryState *) node); break; #endif +#ifdef XCP + case T_RemoteSubplanState: + result = 
ExecRemoteSubplan((RemoteSubplanState *) node); + break; +#endif /* XCP */ default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); @@ -755,6 +844,11 @@ ExecEndNode(PlanState *node) ExecEndRemoteQuery((RemoteQueryState *) node); break; #endif +#ifdef XCP + case T_RemoteSubplanState: + ExecEndRemoteSubplan((RemoteSubplanState *) node); + break; +#endif /* XCP */ default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index ad7c569f93..0652e9d34b 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -12,6 +12,11 @@ * This information is needed by routines manipulating tuples * (getattribute, formtuple, etc.). * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -95,7 +100,9 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/typcache.h" - +#ifdef XCP +#include "pgxc/pgxc.h" +#endif static TupleDesc ExecTypeFromTLInternal(List *targetList, bool hasoid, bool skipjunk); @@ -124,8 +131,13 @@ MakeTupleTableSlot(void) slot->tts_tupleDescriptor = NULL; #ifdef PGXC slot->tts_shouldFreeRow = false; +#ifdef XCP + slot->tts_datarow = NULL; + slot->tts_drowcxt = NULL; +#else slot->tts_dataRow = NULL; slot->tts_dataLen = -1; +#endif slot->tts_attinmeta = NULL; #endif slot->tts_mcxt = CurrentMemoryContext; @@ -359,13 +371,26 @@ ExecStoreTuple(HeapTuple tuple, if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); #ifdef PGXC +#ifdef XCP + if (slot->tts_shouldFreeRow) + { + pfree(slot->tts_datarow); + if (slot->tts_drowcxt) + MemoryContextReset(slot->tts_drowcxt); + } +#else if (slot->tts_shouldFreeRow) pfree(slot->tts_dataRow); +#endif slot->tts_shouldFreeRow = false; +#ifdef XCP + slot->tts_datarow = NULL; +#else slot->tts_dataRow = NULL; slot->tts_dataLen = -1; #endif +#endif /* * Store the new tuple into the specified slot. @@ -428,13 +453,26 @@ ExecStoreMinimalTuple(MinimalTuple mtup, if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); #ifdef PGXC +#ifdef XCP + if (slot->tts_shouldFreeRow) + { + pfree(slot->tts_datarow); + if (slot->tts_drowcxt) + MemoryContextReset(slot->tts_drowcxt); + } +#else if (slot->tts_shouldFreeRow) pfree(slot->tts_dataRow); +#endif slot->tts_shouldFreeRow = false; +#ifdef XCP + slot->tts_datarow = NULL; +#else slot->tts_dataRow = NULL; slot->tts_dataLen = -1; #endif +#endif /* * Drop the pin on the referenced buffer, if there is one. 
@@ -487,13 +525,22 @@ ExecClearTuple(TupleTableSlot *slot) /* slot in which to store tuple */ if (slot->tts_shouldFreeMin) heap_free_minimal_tuple(slot->tts_mintuple); #ifdef PGXC +#ifdef XCP + if (slot->tts_shouldFreeRow) + pfree(slot->tts_datarow); +#else if (slot->tts_shouldFreeRow) pfree(slot->tts_dataRow); +#endif slot->tts_shouldFreeRow = false; +#ifdef XCP + slot->tts_datarow = NULL; +#else slot->tts_dataRow = NULL; slot->tts_dataLen = -1; #endif +#endif slot->tts_tuple = NULL; slot->tts_mintuple = NULL; @@ -605,9 +652,14 @@ ExecCopySlotTuple(TupleTableSlot *slot) /* * Ensure values are extracted from data row to the Datum array */ +#ifdef XCP + if (slot->tts_datarow) + slot_getallattrs(slot); +#else if (slot->tts_dataRow) slot_getallattrs(slot); #endif +#endif /* * Otherwise we need to build a tuple from the Datum array. */ @@ -644,9 +696,14 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot) /* * Ensure values are extracted from data row to the Datum array */ +#ifdef XCP + if (slot->tts_datarow) + slot_getallattrs(slot); +#else if (slot->tts_dataRow) slot_getallattrs(slot); #endif +#endif /* * Otherwise we need to build a tuple from the Datum array. */ @@ -655,6 +712,191 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot) slot->tts_isnull); } +#ifdef PGXC +#ifdef XCP +/* -------------------------------- + * ExecCopySlotDatarow + * Obtain a copy of a slot's data row. The copy is + * palloc'd in the current memory context. 
+ * The slot itself is undisturbed + * -------------------------------- + */ +RemoteDataRow +ExecCopySlotDatarow(TupleTableSlot *slot, MemoryContext tmpcxt) +{ + RemoteDataRow datarow; + if (slot->tts_datarow) + { + int len = slot->tts_datarow->msglen; + /* if we already have datarow make a copy */ + datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len); + datarow->msgnode = slot->tts_datarow->msgnode; + datarow->msglen = len; + memcpy(datarow->msg, slot->tts_datarow->msg, len); + return datarow; + } + else + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + MemoryContext savecxt = NULL; + StringInfoData buf; + uint16 n16; + int i; + + /* ensure we have all values */ + slot_getallattrs(slot); + + /* if temporary memory context is specified reset it */ + if (tmpcxt) + { + MemoryContextReset(tmpcxt); + savecxt = MemoryContextSwitchTo(tmpcxt); + } + + initStringInfo(&buf); + /* Number of parameter values */ + n16 = htons(tdesc->natts); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + for (i = 0; i < tdesc->natts; i++) + { + uint32 n32; + + if (slot->tts_isnull[i]) + { + n32 = htonl(-1); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + else + { + Form_pg_attribute attr = tdesc->attrs[i]; + Oid typOutput; + bool typIsVarlena; + Datum pval; + char *pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena); + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. 
+ */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i])); + else + pval = slot->tts_values[i]; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, pstring, len); + } + } + + /* restore memory context to allocate result */ + if (savecxt) + { + MemoryContextSwitchTo(savecxt); + } + + /* copy data to the buffer */ + datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + buf.len); + datarow->msgnode = InvalidOid; + datarow->msglen = buf.len; + memcpy(datarow->msg, buf.data, buf.len); + pfree(buf.data); + return datarow; + } +} +#else +/* -------------------------------- + * ExecCopySlotDatarow + * Obtain a copy of a slot's data row. The copy is + * palloc'd in the current memory context. + * Pointer to the datarow is returned as a var parameter, function + * returns the length of the data row + * The slot itself is undisturbed + * -------------------------------- + */ +int +ExecCopySlotDatarow(TupleTableSlot *slot, char **datarow) +{ + Assert(datarow); + + if (slot->tts_dataRow) + { + /* if we already have datarow make a copy */ + *datarow = (char *)palloc(slot->tts_dataLen); + memcpy(*datarow, slot->tts_dataRow, slot->tts_dataLen); + return slot->tts_dataLen; + } + else + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + StringInfoData buf; + uint16 n16; + int i; + + initStringInfo(&buf); + /* Number of parameter values */ + n16 = htons(tdesc->natts); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + /* ensure we have all values */ + slot_getallattrs(slot); + for (i = 0; i < tdesc->natts; i++) + { + uint32 n32; + + if (slot->tts_isnull[i]) + { + n32 = htonl(-1); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + else + { + Form_pg_attribute attr = tdesc->attrs[i]; + Oid typOutput; + bool typIsVarlena; + Datum pval; + char 
*pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena); + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. + */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i])); + else + pval = slot->tts_values[i]; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, pstring, len); + } + } + /* copy data to the buffer */ + *datarow = palloc(buf.len); + memcpy(*datarow, buf.data, buf.len); + pfree(buf.data); + return buf.len; + } +} +#endif +#endif /* -------------------------------- * ExecFetchSlotTuple @@ -844,8 +1086,12 @@ ExecMaterializeSlot(TupleTableSlot *slot) #ifdef PGXC if (!slot->tts_shouldFreeRow) { +#ifdef XCP + slot->tts_datarow = NULL; +#else slot->tts_dataRow = NULL; slot->tts_dataLen = -1; +#endif } #endif @@ -1288,6 +1534,58 @@ end_tup_output(TupOutputState *tstate) * * -------------------------------- */ +#ifdef XCP +TupleTableSlot * +ExecStoreDataRowTuple(RemoteDataRow datarow, + TupleTableSlot *slot, + bool shouldFree) +{ + /* + * sanity checks + */ + Assert(datarow != NULL); + Assert(slot != NULL); + Assert(slot->tts_tupleDescriptor != NULL); + + /* + * Free any old physical tuple belonging to the slot. + */ + if (slot->tts_shouldFree) + heap_freetuple(slot->tts_tuple); + if (slot->tts_shouldFreeMin) + heap_free_minimal_tuple(slot->tts_mintuple); + if (slot->tts_shouldFreeRow) + { + pfree(slot->tts_datarow); + if (slot->tts_drowcxt) + MemoryContextReset(slot->tts_drowcxt); + } + + /* + * Drop the pin on the referenced buffer, if there is one. 
+ */ + if (BufferIsValid(slot->tts_buffer)) + ReleaseBuffer(slot->tts_buffer); + + slot->tts_buffer = InvalidBuffer; + + /* + * Store the new tuple into the specified slot. + */ + slot->tts_isempty = false; + slot->tts_shouldFree = false; + slot->tts_shouldFreeMin = false; + slot->tts_shouldFreeRow = shouldFree; + slot->tts_tuple = NULL; + slot->tts_mintuple = NULL; + slot->tts_datarow = datarow; + + /* Mark extracted state invalid */ + slot->tts_nvalid = 0; + + return slot; +} +#else TupleTableSlot * ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot, bool shouldFree) @@ -1343,3 +1641,4 @@ ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot, return slot; } #endif +#endif diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index ac39b1fbeb..f8fadb26d9 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -3,6 +3,11 @@ * execUtils.c * miscellaneous executor utility routines * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -117,9 +122,11 @@ CreateExecutorState(void) estate->es_result_relations = NULL; estate->es_num_result_relations = 0; estate->es_result_relation_info = NULL; -#ifdef PGXC +#ifdef PGXC +#ifndef XCP estate->es_result_remoterel = NULL; #endif +#endif estate->es_trig_target_relations = NIL; estate->es_trig_tuple_slot = NULL; diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index e6b57539b4..6af20e10dd 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -491,6 +491,24 @@ init_execution_state(List *queryTree_list, errmsg("%s is not allowed in a non-volatile function", CreateCommandTag(stmt)))); +#ifdef PGXC + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + if (queryTree->commandType != CMD_UTILITY) + { + /* + * The parameterised queries in RemoteQuery nodes will be prepared + * on the Datanode, and need parameter types for the same. Set the + * parameter types and their number in all RemoteQuery nodes in the + * plan + */ + SetRemoteStatementName(((PlannedStmt *)stmt)->planTree, NULL, + fcache->pinfo->nargs, + fcache->pinfo->argtypes, 0); + } + } +#endif /* PGXC */ + /* OK, build the execution_state for this query */ newes = (execution_state *) palloc(sizeof(execution_state)); if (preves) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index f94d7452d1..dc16e88454 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -67,6 +67,11 @@ * but direct examination of the node is needed to use it before 9.0. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -179,9 +184,15 @@ typedef struct AggStatePerAggData */ int16 inputtypeLen, resulttypeLen, +#ifdef XCP + collecttypeLen, +#endif transtypeLen; bool inputtypeByVal, resulttypeByVal, +#ifdef XCP + collecttypeByVal, +#endif transtypeByVal; /* @@ -520,6 +531,7 @@ advance_transition_function(AggState *aggstate, } #ifdef PGXC +#ifndef XCP /* * Given new input value(s), advance the collection function of an aggregate. * @@ -624,6 +636,7 @@ advance_collection_function(AggState *aggstate, MemoryContextSwitchTo(oldContext); } +#endif /* XCP */ #endif /* PGXC */ /* @@ -696,6 +709,7 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup) } #ifdef PGXC +#ifndef XCP if (aggstate->skip_trans) { Assert(IS_PGXC_COORDINATOR); @@ -707,6 +721,7 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup) pergroupstate, &fcinfo); } else +#endif /* XCP */ #endif /* PGXC */ advance_transition_function(aggstate, peraggstate, pergroupstate, &fcinfo); @@ -906,8 +921,52 @@ finalize_aggregate(AggState *aggstate, Datum *resultVal, bool *resultIsNull) { MemoryContext oldContext; +#ifdef XCP + Datum value; + bool isnull; +#endif oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); +#ifdef XCP + if (OidIsValid(peraggstate->collectfn_oid)) + { + FunctionCallInfoData fcinfo; + InitFunctionCallInfoData(fcinfo, &(peraggstate->collectfn), 2, + peraggstate->aggCollation, + (void *) aggstate, NULL); + fcinfo.arg[1] = pergroupstate->transValue; + fcinfo.argnull[1] = pergroupstate->transValueIsNull; + if (fcinfo.flinfo->fn_strict && + (peraggstate->initCollectValueIsNull || pergroupstate->transValueIsNull)) + { + /* + * We have already checked the collection and transition types are + * binary compatible, so we can just copy the value. 
+ */ + value = pergroupstate->transValue; + isnull = pergroupstate->transValueIsNull; + } + else + { + /* + * copy the initial datum since it might get changed inside the + * collection function + */ + fcinfo.arg[0] = datumCopy(peraggstate->initCollectValue, + peraggstate->collecttypeByVal, + peraggstate->collecttypeLen); + fcinfo.argnull[0] = peraggstate->initCollectValueIsNull; + value = FunctionCallInvoke(&fcinfo); + isnull = fcinfo.isnull; + } + } + else + { + /* No collect function, just use transition values to finalize */ + value = pergroupstate->transValue; + isnull = pergroupstate->transValueIsNull; + } +#else #ifdef PGXC /* * if we skipped the transition phase, we have the collection result in the @@ -919,6 +978,7 @@ finalize_aggregate(AggState *aggstate, pergroupstate->transValueIsNull = pergroupstate->collectValueIsNull; } #endif /* PGXC */ +#endif /* XCP */ /* * Apply the agg's finalfn if one is provided, else return transValue. @@ -930,9 +990,15 @@ finalize_aggregate(AggState *aggstate, InitFunctionCallInfoData(fcinfo, &(peraggstate->finalfn), 1, peraggstate->aggCollation, (void *) aggstate, NULL); +#ifdef XCP + fcinfo.arg[0] = value; + fcinfo.argnull[0] = isnull; + if (fcinfo.flinfo->fn_strict && isnull) +#else fcinfo.arg[0] = pergroupstate->transValue; fcinfo.argnull[0] = pergroupstate->transValueIsNull; if (fcinfo.flinfo->fn_strict && pergroupstate->transValueIsNull) +#endif /* XCP */ { /* don't call a strict function with NULL inputs */ *resultVal = (Datum) 0; @@ -946,8 +1012,13 @@ finalize_aggregate(AggState *aggstate, } else { +#ifdef XCP + *resultVal = value; + *resultIsNull = isnull; +#else *resultVal = pergroupstate->transValue; *resultIsNull = pergroupstate->transValueIsNull; +#endif /* XCP */ } /* @@ -1549,6 +1620,7 @@ agg_retrieve_hash_table(AggState *aggstate) return NULL; } + /* ----------------- * ExecInitAgg * @@ -1586,7 +1658,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) aggstate->pergroup = NULL; aggstate->grp_firstTuple = 
NULL; aggstate->hashtable = NULL; +#ifndef XCP aggstate->skip_trans = node->skip_trans; +#endif /* * Create expression contexts. We need two, one for per-input-tuple @@ -1743,6 +1817,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) HeapTuple aggTuple; Form_pg_aggregate aggform; Oid aggtranstype; +#ifdef XCP + Oid aggcollecttype; +#endif /* XCP */ AclResult aclresult; Oid transfn_oid, finalfn_oid; @@ -1819,6 +1896,26 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; #ifdef PGXC peraggstate->collectfn_oid = collectfn_oid = aggform->aggcollectfn; +#ifdef XCP + /* + * If preparing PHASE1 skip finalization step and return transmission + * value to be collected and finalized on master node. + * If preparing PHASE2 move collection function into transition slot, + * so master node collected transition values and finalithed them. + * Otherwise (one-node aggregation) do all steps locally, the collection + * function will just convert transient value for finalization function. 
+ */ + if (node->aggdistribution == AGG_SLAVE) + { + peraggstate->collectfn_oid = collectfn_oid = InvalidOid; + peraggstate->finalfn_oid = finalfn_oid = InvalidOid; + } + else if (node->aggdistribution == AGG_MASTER) + { + peraggstate->transfn_oid = transfn_oid = collectfn_oid; + peraggstate->collectfn_oid = collectfn_oid = InvalidOid; + } +#else /* * For PGXC final and collection functions are used to combine results at Coordinator, * disable those for Datanode @@ -1828,6 +1925,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) peraggstate->finalfn_oid = finalfn_oid = InvalidOid; peraggstate->collectfn_oid = collectfn_oid = InvalidOid; } +#endif /* XCP */ #endif /* PGXC */ /* Check that aggregate owner has permission to call component fns */ { @@ -1869,6 +1967,15 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) } /* resolve actual type of transition state, if polymorphic */ +#ifdef XCP + /* + * We substitute function for PHASE2 and should take collection type + * as transient + */ + if (node->aggdistribution == AGG_MASTER) + aggtranstype = aggform->aggcollecttype; + else +#endif /* XCP */ aggtranstype = aggform->aggtranstype; if (IsPolymorphicType(aggtranstype)) { @@ -1886,18 +1993,34 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) false); pfree(declaredArgTypes); } - +#ifdef XCP + /* get type of collection state, if defined */ + if (OidIsValid(collectfn_oid)) + aggcollecttype = aggform->aggcollecttype; + else + aggcollecttype = InvalidOid; +#endif /* build expression trees using actual argument & result types */ build_aggregate_fnexprs(inputTypes, numArguments, aggtranstype, +#ifdef XCP + aggcollecttype, +#endif aggref->aggtype, aggref->inputcollid, transfn_oid, +#ifdef XCP + collectfn_oid, +#endif finalfn_oid, &transfnexpr, +#ifdef XCP + &collectfnexpr, +#endif &finalfnexpr); #ifdef PGXC +#ifndef XCP if (OidIsValid(collectfn_oid)) { /* we expect final function expression to be NULL in call to @@ -1923,6 +2046,7 @@ ExecInitAgg(Agg *node, EState 
*estate, int eflags) &dummyexpr); Assert(!dummyexpr); } +#endif /* XCP */ #endif /* PGXC */ fmgr_info(transfn_oid, &peraggstate->transfn); @@ -1949,11 +2073,27 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) get_typlenbyval(aggtranstype, &peraggstate->transtypeLen, &peraggstate->transtypeByVal); +#ifdef XCP + if (OidIsValid(aggcollecttype)) + get_typlenbyval(aggcollecttype, + &peraggstate->collecttypeLen, + &peraggstate->collecttypeByVal); +#endif /* XCP */ /* * initval is potentially null, so don't try to access it as a struct * field. Must do it the hard way with SysCacheGetAttr. */ +#ifdef XCP + /* + * If this is Phase2 get collect initial value instead + */ + if (node->aggdistribution == AGG_MASTER) + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, + Anum_pg_aggregate_agginitcollect, + &peraggstate->initValueIsNull); + else +#endif /* XCP */ textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, Anum_pg_aggregate_agginitval, &peraggstate->initValueIsNull); @@ -1970,6 +2110,34 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) * access it as a struct field. Must do it the hard way with * SysCacheGetAttr. */ +#ifdef XCP + if (OidIsValid(aggcollecttype)) + { + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, + Anum_pg_aggregate_agginitcollect, + &peraggstate->initCollectValueIsNull); + if (peraggstate->initCollectValueIsNull) + peraggstate->initCollectValue = (Datum) 0; + else + peraggstate->initCollectValue = GetAggInitVal(textInitVal, + aggcollecttype); + /* + * If the collectfn is strict and the initval is NULL, make sure + * transtype and collecttype are the same (or at least + * binary-compatible), so that it's OK to use the transition value + * as the initial collectValue. This should have been checked at agg + * definition time, but just in case... 
+ */ + if (peraggstate->collectfn.fn_strict && peraggstate->initValueIsNull) + { + if (!IsBinaryCoercible(aggtranstype, aggcollecttype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible transition type and collection type", + aggref->aggfnoid))); + } + } +#else textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, Anum_pg_aggregate_agginitcollect, &peraggstate->initCollectValueIsNull); @@ -1979,6 +2147,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) else peraggstate->initCollectValue = GetAggInitVal(textInitVal, aggtranstype); +#endif /* XCP */ #endif /* PGXC */ /* diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 0026364376..b59c847752 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3,6 +3,11 @@ * nodeModifyTable.c * routines to handle ModifyTable nodes. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -44,9 +49,11 @@ #include "miscadmin.h" #include "nodes/nodeFuncs.h" #ifdef PGXC +#ifndef XCP #include "pgxc/execRemote.h" #include "pgxc/pgxc.h" #endif +#endif #include "storage/bufmgr.h" #include "utils/builtins.h" #include "utils/memutils.h" @@ -173,9 +180,11 @@ ExecInsert(TupleTableSlot *slot, Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; -#ifdef PGXC +#ifdef PGXC +#ifndef XCP PlanState *resultRemoteRel = NULL; #endif +#endif /* * get the heap tuple out of the tuple table slot, making sure we have a @@ -188,9 +197,11 @@ ExecInsert(TupleTableSlot *slot, */ resultRelInfo = estate->es_result_relation_info; resultRelationDesc = resultRelInfo->ri_RelationDesc; -#ifdef PGXC +#ifdef PGXC +#ifndef XCP resultRemoteRel = estate->es_result_remoterel; #endif +#endif /* * If the result relation has OIDs, force the tuple's OID to zero so that * heap_insert will assign a fresh OID. 
Usually the OID already will be @@ -242,9 +253,11 @@ ExecInsert(TupleTableSlot *slot, ExecConstraints(resultRelInfo, slot, estate); #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && resultRemoteRel) { - slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, slot); + ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, slot); + /* * PGXCTODO: If target table uses WITH OIDS, this should be set to the Oid inserted * but Oids are not consistent among nodes in Postgres-XC, so this is set to the @@ -254,7 +267,8 @@ ExecInsert(TupleTableSlot *slot, newId = InvalidOid; } else -#endif +#endif +#endif { /* * insert the tuple @@ -276,55 +290,6 @@ ExecInsert(TupleTableSlot *slot, if (canSetTag) { -#ifdef PGXC - if (IS_PGXC_COORDINATOR && resultRelInfo->ri_projectReturning) - { - /* - * Consider this example - * - * CREATE TABLE bar(c3 int, c4 int); - * INSERT INTO bar VALUES(123,456); - * INSERT INTO bar VALUES(123,789); - * - * CREATE TABLE foo (c1 int, c2 int); - * INSERT INTO foo VALUES (1,2), (3,4); - * Consider this join query - * select f.ctid, b.ctid, * from foo f, bar b where f.c1+122=b.c3; - * Note it returned TWO rows - * ctid | ctid | c1 | c2 | c3 | c4 - * -------+-------+----+----+-----+----- - * (0,1) | (0,1) | 1 | 2 | 123 | 456 - * (0,1) | (0,2) | 1 | 2 | 123 | 789 - * (2 rows) - * - * Now consider the update with the same join condition - * - * update foo set c2=c2*2 from bar b - * WHERE foo.c1+122 = b.c3 RETURNING *, foo.ctid; - * - * The update would run twice since we got two rows from the join. - * When the first update runs it will change the ctid of the row - * to be updated and would return the updated row with ctid say (0,3). - * The second update would not update any row since the row with - * ctid (0,1) would no more exist in foo, it would therefore return - * an empty slot. 
- * - * update foo set c2=c2*2 from bar b - * WHERE foo.c1+122 = b.c3 RETURNING *, foo.ctid; - * f1 | f2 | f3 | q1 | q2 | ctid - * ----+------+----+-----+------------------+------- - * 1 | test | 84 | 123 | 4567890123456789 | (0,3) - * (1 row) - * - * It is therefore possible in ExecInsert/Update/Delete - * to receive an empty slot, and we have to add checks - * before we can update the processed tuple count. - */ - if (!TupIsNull(slot)) - (estate->es_processed)++; - } - else -#endif (estate->es_processed)++; estate->es_lastoid = newId; setLastTid(&(tuple->t_self)); @@ -337,16 +302,8 @@ ExecInsert(TupleTableSlot *slot, /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) -#ifdef PGXC - { - if (TupIsNull(slot)) - return NULL; -#endif return ExecProcessReturning(resultRelInfo->ri_projectReturning, slot, planSlot); -#ifdef PGXC - } -#endif return NULL; } @@ -379,8 +336,9 @@ ExecDelete(ItemPointer tupleid, ItemPointerData update_ctid; TransactionId update_xmax; #ifdef PGXC +#ifndef XCP PlanState *resultRemoteRel = NULL; - TupleTableSlot *slot; +#endif #endif /* @@ -389,8 +347,10 @@ ExecDelete(ItemPointer tupleid, resultRelInfo = estate->es_result_relation_info; resultRelationDesc = resultRelInfo->ri_RelationDesc; #ifdef PGXC +#ifndef XCP resultRemoteRel = estate->es_result_remoterel; #endif +#endif /* BEFORE ROW DELETE Triggers */ if (resultRelInfo->ri_TrigDesc && @@ -439,13 +399,15 @@ ExecDelete(ItemPointer tupleid, */ ldelete:; #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && resultRemoteRel) { - slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, planSlot); + ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, planSlot); } else { #endif +#endif result = heap_delete(resultRelationDesc, tupleid, &update_ctid, &update_xmax, estate->es_output_cid, @@ -499,48 +461,27 @@ ldelete:; */ #ifdef PGXC +#ifndef XCP } #endif +#endif } if (canSetTag) -#ifdef PGXC - { - if (IS_PGXC_COORDINATOR && 
resultRelInfo->ri_projectReturning) - { - /* For reason see comments in ExecInsert */ - if (!TupIsNull(slot)) - (estate->es_processed)++; - } - else -#endif (estate->es_processed)++; -#ifdef PGXC - } -#endif #ifdef PGXC +#ifndef XCP /* * Do not fire triggers on remote relation, it would not find old tuple */ if (resultRemoteRel == NULL) #endif +#endif /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid); /* Process RETURNING if present */ -#ifdef PGXC - if (resultRelInfo->ri_projectReturning && resultRemoteRel != NULL && - IS_PGXC_COORDINATOR && !IsConnFromCoord()) - { - if (TupIsNull(slot)) - return NULL; - - return ExecProcessReturning(resultRelInfo->ri_projectReturning, - slot, planSlot); - } - else -#endif if (resultRelInfo->ri_projectReturning) { /* @@ -623,8 +564,10 @@ ExecUpdate(ItemPointer tupleid, TransactionId update_xmax; List *recheckIndexes = NIL; #ifdef PGXC +#ifndef XCP PlanState *resultRemoteRel = NULL; #endif +#endif /* * abort the operation if not running transactions @@ -644,8 +587,10 @@ ExecUpdate(ItemPointer tupleid, resultRelInfo = estate->es_result_relation_info; resultRelationDesc = resultRelInfo->ri_RelationDesc; #ifdef PGXC +#ifndef XCP resultRemoteRel = estate->es_result_remoterel; #endif +#endif /* BEFORE ROW UPDATE Triggers */ if (resultRelInfo->ri_TrigDesc && @@ -701,13 +646,15 @@ lreplace:; ExecConstraints(resultRelInfo, slot, estate); #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && resultRemoteRel) { - slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, planSlot); + ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, planSlot); } else { #endif +#endif /* * replace the heap tuple * @@ -782,32 +729,23 @@ lreplace:; recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), estate); #ifdef PGXC +#ifndef XCP } #endif +#endif } if (canSetTag) -#ifdef PGXC - { - if (IS_PGXC_COORDINATOR && resultRelInfo->ri_projectReturning) - { - /* For reason see comments in 
ExecInsert */ - if (!TupIsNull(slot)) - (estate->es_processed)++; - } - else -#endif (estate->es_processed)++; -#ifdef PGXC - } -#endif #ifdef PGXC +#ifndef XCP /* * Do not fire triggers on remote relation, it would not find old tuple */ if (resultRemoteRel == NULL) #endif +#endif /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, tupleid, tuple, recheckIndexes); @@ -816,16 +754,8 @@ lreplace:; /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) -#ifdef PGXC - { - if (TupIsNull(slot)) - return NULL; -#endif return ExecProcessReturning(resultRelInfo->ri_projectReturning, slot, planSlot); -#ifdef PGXC - } -#endif return NULL; } @@ -893,10 +823,12 @@ ExecModifyTable(ModifyTableState *node) ResultRelInfo *saved_resultRelInfo; ResultRelInfo *resultRelInfo; PlanState *subplanstate; -#ifdef PGXC +#ifdef PGXC +#ifndef XCP PlanState *remoterelstate; PlanState *saved_resultRemoteRel; -#endif +#endif +#endif JunkFilter *junkfilter; TupleTableSlot *slot; TupleTableSlot *planSlot; @@ -938,9 +870,10 @@ ExecModifyTable(ModifyTableState *node) resultRelInfo = node->resultRelInfo + node->mt_whichplan; subplanstate = node->mt_plans[node->mt_whichplan]; #ifdef PGXC - /* Initialize remote plan state */ +#ifndef XCP remoterelstate = node->mt_remoterels[node->mt_whichplan]; #endif +#endif junkfilter = resultRelInfo->ri_junkFilter; /* @@ -952,13 +885,17 @@ ExecModifyTable(ModifyTableState *node) */ saved_resultRelInfo = estate->es_result_relation_info; #ifdef PGXC +#ifndef XCP saved_resultRemoteRel = estate->es_result_remoterel; #endif +#endif estate->es_result_relation_info = resultRelInfo; #ifdef PGXC +#ifndef XCP estate->es_result_remoterel = remoterelstate; #endif +#endif /* * Fetch rows from subplan(s), and execute the required table modification @@ -985,9 +922,11 @@ ExecModifyTable(ModifyTableState *node) resultRelInfo++; subplanstate = node->mt_plans[node->mt_whichplan]; #ifdef PGXC +#ifndef XCP /* Move to next remote plan */ 
estate->es_result_remoterel = node->mt_remoterels[node->mt_whichplan]; - remoterelstate = node->mt_remoterels[node->mt_whichplan]; + remoterelstate = node->mt_plans[node->mt_whichplan]; +#endif #endif junkfilter = resultRelInfo->ri_junkFilter; estate->es_result_relation_info = resultRelInfo; @@ -1045,9 +984,7 @@ ExecModifyTable(ModifyTableState *node) if (operation != CMD_DELETE) slot = ExecFilterJunk(junkfilter, slot); } -#ifdef PGXC - estate->es_result_remoterel = remoterelstate; -#endif + switch (operation) { case CMD_INSERT: @@ -1073,18 +1010,17 @@ ExecModifyTable(ModifyTableState *node) if (slot) { estate->es_result_relation_info = saved_resultRelInfo; -#ifdef PGXC - estate->es_result_remoterel = saved_resultRemoteRel; -#endif return slot; } } /* Restore es_result_relation_info before exiting */ estate->es_result_relation_info = saved_resultRelInfo; -#ifdef PGXC +#ifdef PGXC +#ifndef XCP estate->es_result_remoterel = saved_resultRemoteRel; -#endif +#endif +#endif /* * We're done, but fire AFTER STATEMENT triggers before exiting. @@ -1112,9 +1048,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) Plan *subplan; ListCell *l; int i; -#ifdef PGXC - PlanState *saved_remoteRelInfo; -#endif /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); @@ -1132,9 +1065,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->mt_done = false; mtstate->mt_plans = (PlanState **) palloc0(sizeof(PlanState *) * nplans); -#ifdef PGXC +#ifdef PGXC +#ifndef XCP mtstate->mt_remoterels = (PlanState **) palloc0(sizeof(PlanState *) * nplans); -#endif +#endif +#endif mtstate->resultRelInfo = estate->es_result_relations + node->resultRelIndex; mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans); mtstate->mt_nplans = nplans; @@ -1152,24 +1087,25 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * sub-plan; ExecContextForcesOids depends on that! 
*/ saved_resultRelInfo = estate->es_result_relation_info; -#ifdef PGXC - saved_remoteRelInfo = estate->es_result_remoterel; -#endif resultRelInfo = mtstate->resultRelInfo; i = 0; foreach(l, node->plans) { -#ifdef PGXC +#ifdef PGXC +#ifndef XCP Plan *remoteplan = NULL; -#endif +#endif +#endif subplan = (Plan *) lfirst(l); -#ifdef PGXC +#ifdef PGXC +#ifndef XCP if (node->remote_plans) remoteplan = list_nth(node->remote_plans, i); -#endif +#endif +#endif /* * Verify result relation is a valid target for the current operation @@ -1195,24 +1131,23 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags); #ifdef PGXC +#ifndef XCP if (remoteplan) { - /* + /* * Init the plan for the remote execution for this result rel. This is * used to execute data modification queries on the remote nodes */ mtstate->mt_remoterels[i] = ExecInitNode(remoteplan, estate, eflags); } -#endif +#endif +#endif resultRelInfo++; i++; } estate->es_result_relation_info = saved_resultRelInfo; -#ifdef PGXC - estate->es_result_remoterel = saved_remoteRelInfo; -#endif /* * Initialize RETURNING projections if needed. diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 9072581c05..bc27d861ad 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -82,40 +82,6 @@ ExecSort(SortState *node) outerNode = outerPlanState(node); tupDesc = ExecGetResultType(outerNode); -#ifdef PGXC - if (plannode->srt_start_merge && - IsA(node->ss.ps.lefttree, RemoteQueryState)) - { - RemoteQueryState *rqs = (RemoteQueryState *)node->ss.ps.lefttree; - - rqs->rqs_for_sort = true; - /* - * Start the queries on all the nodes. That way we get the number of - * connections and connection handlers set in RemoteQueryState. - * Those will be used to merge the data from the datanodes. 
- */ - if (!rqs->query_Done) - { - do_query(rqs); - rqs->query_Done = true; - } - - /* - * PGXCTODO: We don't handle bounded in this case, but see if it can - * be used. - */ - tuplesortstate = tuplesort_begin_merge(tupDesc, - plannode->numCols, - plannode->sortColIdx, - plannode->sortOperators, - plannode->collations, - plannode->nullsFirst, - rqs, work_mem); - - } - else - { -#endif /* PGXC */ tuplesortstate = tuplesort_begin_heap(tupDesc, plannode->numCols, plannode->sortColIdx, @@ -126,15 +92,8 @@ ExecSort(SortState *node) node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); -#ifdef PGXC - } -#endif /* PGXC */ node->tuplesortstate = (void *) tuplesortstate; -#ifdef PGXC - if (!plannode->srt_start_merge) - { -#endif /* PGXC */ /* * Scan the subplan and feed all the tuples to tuplesort. */ @@ -153,11 +112,6 @@ ExecSort(SortState *node) * Complete the sort. */ tuplesort_performsort(tuplesortstate); -#ifdef PGXC - } - else - Assert(IsA(node->ss.ps.lefttree, RemoteQueryState)); -#endif /* PGXC */ /* * restore to user specified direction diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index da31820e2d..f6fb1c955f 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -3,6 +3,11 @@ * nodeSubplan.c * routines to support subselects * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -660,6 +665,11 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) sstate->planstate = (PlanState *) list_nth(estate->es_subplanstates, subplan->plan_id - 1); +#ifdef XCP + /* subplan is referenced on local node, finish initialization */ + ExecFinishInitProcNode(sstate->planstate); +#endif + /* Initialize subexpressions */ sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent); sstate->args = (List *) ExecInitExpr((Expr *) subplan->args, parent); diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c index ce89ff2ebf..43d4581cd1 100644 --- a/src/backend/executor/nodeWindowAgg.c +++ b/src/backend/executor/nodeWindowAgg.c @@ -23,6 +23,11 @@ * aggregate function over all rows in the current row's window frame. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -1717,10 +1722,19 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, HeapTuple aggTuple; Form_pg_aggregate aggform; Oid aggtranstype; +#ifdef XCP + Oid aggcollecttype; +#endif AclResult aclresult; Oid transfn_oid, +#ifdef XCP + collectfn_oid, +#endif finalfn_oid; Expr *transfnexpr, +#ifdef XCP + *collectfnexpr, +#endif *finalfnexpr; Datum textInitVal; int i; @@ -1746,6 +1760,9 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, */ peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn; +#ifdef XCP + collectfn_oid = aggform->aggcollectfn; +#endif peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; /* Check that aggregate owner has permission to call component fns */ @@ -1794,16 +1811,28 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc, false); pfree(declaredArgTypes); } +#ifdef XCP + aggcollecttype = aggform->aggcollecttype; +#endif /* build expression trees using actual argument & result types */ build_aggregate_fnexprs(inputTypes, numArguments, aggtranstype, +#ifdef XCP + aggcollecttype, +#endif wfunc->wintype, wfunc->inputcollid, transfn_oid, +#ifdef XCP + collectfn_oid, +#endif finalfn_oid, &transfnexpr, +#ifdef XCP + &collectfnexpr, +#endif &finalfnexpr); fmgr_info(transfn_oid, &peraggstate->transfn); diff --git a/src/backend/executor/producerReceiver.c b/src/backend/executor/producerReceiver.c new file mode 100644 index 0000000000..b7339f16c6 --- /dev/null +++ b/src/backend/executor/producerReceiver.c @@ -0,0 +1,290 @@ +/*------------------------------------------------------------------------- + * + * producerReceiver.c + * An implementation of DestReceiver that distributes the result tuples to + * multiple customers via a SharedQueue. + * + * + * Copyright (c) 2012-2014, TransLattice, Inc. 
+ * + * IDENTIFICATION + * src/backend/executor/producerReceiver.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/producerReceiver.h" +#include "pgxc/nodemgr.h" +#include "tcop/pquery.h" +#include "utils/tuplestore.h" + +typedef struct +{ + DestReceiver pub; + /* parameters: */ + DestReceiver *consumer; /* where to put the tuples for self */ + AttrNumber distKey; /* distribution key attribute in the tuple */ + Locator *locator; /* locator is determining destination nodes */ + int *distNodes; /* array where to get locator results */ + int *consMap; /* map of consumers: consMap[node-1] indicates + * the target consumer */ + SharedQueue squeue; /* a SharedQueue for result distribution */ + MemoryContext tmpcxt; /* holds temporary data */ + Tuplestorestate **tstores; /* storage to buffer data if destination queue + * is full */ + TupleDesc typeinfo; /* description of received tuples */ + long tcount; + long selfcount; + long othercount; +} ProducerState; + + +/* + * Prepare to receive tuples from executor. + */ +static void +producerStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo) +{ + ProducerState *myState = (ProducerState *) self; + + if (ActivePortal) + { + /* Normally ExecutorContext is current here. However we should better + * create local producer storage in the Portal's context: producer + * may keep pushing records to consumers after executor is destroyed. 
+ */ + MemoryContext savecontext; + savecontext = MemoryContextSwitchTo(PortalGetHeapMemory(ActivePortal)); + myState->typeinfo = CreateTupleDescCopy(typeinfo); + MemoryContextSwitchTo(savecontext); + } + else + myState->typeinfo = typeinfo; + + if (myState->consumer) + (*myState->consumer->rStartup) (myState->consumer, operation, typeinfo); +} + +/* + * Receive a tuple from the executor and dispatch it to the proper consumer + */ +static void +producerReceiveSlot(TupleTableSlot *slot, DestReceiver *self) +{ + ProducerState *myState = (ProducerState *) self; + Datum value; + bool isnull; + int ncount, i; + + if (myState->distKey == InvalidAttrNumber) + { + value = (Datum) 0; + isnull = true; + } + else + value = slot_getattr(slot, myState->distKey, &isnull); + ncount = GET_NODES(myState->locator, value, isnull, NULL); + + myState->tcount++; + /* Dispatch the tuple */ + for (i = 0; i < ncount; i++) + { + int consumerIdx = myState->distNodes[i]; + + if (consumerIdx == SQ_CONS_NONE) + { + continue; + } + else if (consumerIdx == SQ_CONS_SELF) + { + Assert(myState->consumer); + (*myState->consumer->receiveSlot) (slot, myState->consumer); + myState->selfcount++; + } + else if (myState->squeue) + { + /* + * If the tuple will not fit to the consumer queue it will be stored + * in the local tuplestore. The tuplestore should be in the portal + * context, because ExecutorContext may be destroyed when tuples + * are not yet pushed to the consumer queue. 
+ */ + MemoryContext savecontext; + Assert(ActivePortal); + savecontext = MemoryContextSwitchTo(PortalGetHeapMemory(ActivePortal)); + SharedQueueWrite(myState->squeue, consumerIdx, slot, + &myState->tstores[consumerIdx], myState->tmpcxt); + MemoryContextSwitchTo(savecontext); + myState->othercount++; + } + } +} + + +/* + * Clean up at end of an executor run + */ +static void +producerShutdownReceiver(DestReceiver *self) +{ + ProducerState *myState = (ProducerState *) self; + + if (myState->consumer) + (*myState->consumer->rShutdown) (myState->consumer); +} + + +/* + * Destroy receiver when done with it + */ +static void +producerDestroyReceiver(DestReceiver *self) +{ + ProducerState *myState = (ProducerState *) self; + + elog(LOG, "Producer stats: total %ld tuples, %ld tuples to self, %ld to other nodes", + myState->tcount, myState->selfcount, myState->othercount); + + if (myState->consumer) + (*myState->consumer->rDestroy) (myState->consumer); + + /* Make sure all data are in the squeue */ + while (myState->tstores) + { + if (SharedQueueFinish(myState->squeue, myState->typeinfo, + myState->tstores) == 0) + { + pfree(myState->tstores); + myState->tstores = NULL; + } + else + { + pg_usleep(10000l); + /* + * Do not wait for consumers that were not even connected after 10 + * seconds after start waiting for their disconnection. + * That should help to break the loop which would otherwise be endless. + * The error will be emitted later in SharedQueueUnBind + */ + SharedQueueResetNotConnected(myState->squeue); + } + } + + /* wait while consumers are finishing and release shared resources */ + if (myState->squeue) + SharedQueueUnBind(myState->squeue); + myState->squeue = NULL; + + /* Release workspace if any */ + if (myState->locator) + freeLocator(myState->locator); + pfree(myState); +} + + +/* + * Initially create a DestReceiver object. 
+ */ +DestReceiver * +CreateProducerDestReceiver(void) +{ + ProducerState *self = (ProducerState *) palloc0(sizeof(ProducerState)); + + self->pub.receiveSlot = producerReceiveSlot; + self->pub.rStartup = producerStartupReceiver; + self->pub.rShutdown = producerShutdownReceiver; + self->pub.rDestroy = producerDestroyReceiver; + self->pub.mydest = DestProducer; + + /* private fields will be set by SetTuplestoreDestReceiverParams */ + self->tcount = 0; + self->selfcount = 0; + self->othercount = 0; + + return (DestReceiver *) self; +} + + +/* + * Set parameters for a ProducerDestReceiver + */ +void +SetProducerDestReceiverParams(DestReceiver *self, + AttrNumber distKey, + Locator *locator, + SharedQueue squeue) +{ + ProducerState *myState = (ProducerState *) self; + + Assert(myState->pub.mydest == DestProducer); + myState->distKey = distKey; + myState->locator = locator; + myState->squeue = squeue; + myState->typeinfo = NULL; + myState->tmpcxt = NULL; + /* Create workspace */ + myState->distNodes = (int *) getLocatorResults(locator); + if (squeue) + myState->tstores = (Tuplestorestate **) + palloc0(NumDataNodes * sizeof(Tuplestorestate *)); +} + + +/* + * Set a DestReceiver to receive tuples targeted to "self". + * Returns old value of the self consumer + */ +DestReceiver * +SetSelfConsumerDestReceiver(DestReceiver *self, + DestReceiver *consumer) +{ + ProducerState *myState = (ProducerState *) self; + DestReceiver *oldconsumer; + + Assert(myState->pub.mydest == DestProducer); + oldconsumer = myState->consumer; + myState->consumer = consumer; + return oldconsumer; +} + + +/* + * Set a memory context to hold temporary data + */ +void +SetProducerTempMemory(DestReceiver *self, MemoryContext tmpcxt) +{ + ProducerState *myState = (ProducerState *) self; + DestReceiver *oldconsumer; + + Assert(myState->pub.mydest == DestProducer); + myState->tmpcxt = tmpcxt; +} + + +/* + * Push data from the local tuplestores to the shared memory so consumers can + * read them. 
Returns true if all data are pushed, false if something remains + * in the tuplestores yet. + */ +bool +ProducerReceiverPushBuffers(DestReceiver *self) +{ + ProducerState *myState = (ProducerState *) self; + + Assert(myState->pub.mydest == DestProducer); + if (myState->tstores) + { + if (SharedQueueFinish(myState->squeue, myState->typeinfo, + myState->tstores) == 0) + { + pfree(myState->tstores); + myState->tstores = NULL; + } + else + return false; + } + return true; +} diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c index 0bc90cb59a..dbdb8be2b6 100644 --- a/src/backend/libpq/be-fsstubs.c +++ b/src/backend/libpq/be-fsstubs.c @@ -101,11 +101,18 @@ lo_open(PG_FUNCTION_ARGS) int fd; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif #if FSDB elog(DEBUG4, "lo_open(%u,%d)", lobjId, mode); @@ -134,11 +141,18 @@ lo_close(PG_FUNCTION_ARGS) int32 fd = PG_GETARG_INT32(0); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, @@ -171,11 +185,18 @@ lo_read(int fd, char *buf, int len) int status; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else 
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, @@ -204,11 +225,18 @@ lo_write(int fd, const char *buf, int len) int status; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, @@ -247,11 +275,18 @@ lo_lseek(PG_FUNCTION_ARGS) int status; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, @@ -269,11 +304,18 @@ lo_creat(PG_FUNCTION_ARGS) Oid lobjId; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif /* * We don't actually need to store into fscxt, but create it anyway to @@ -292,11 +334,18 @@ lo_create(PG_FUNCTION_ARGS) Oid lobjId = PG_GETARG_OID(0); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif /* * We don't actually need to store into fscxt, but create it anyway to @@ -315,11 +364,18 @@ lo_tell(PG_FUNCTION_ARGS) int32 fd = PG_GETARG_INT32(0); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, @@ -335,11 +391,18 @@ lo_unlink(PG_FUNCTION_ARGS) Oid lobjId = PG_GETARG_OID(0); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif /* Must be owner of the largeobject */ if (!lo_compat_privileges && @@ -385,11 +448,18 @@ loread(PG_FUNCTION_ARGS) int totalread; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (len < 0) len = 0; @@ -410,11 +480,18 @@ 
lowrite(PG_FUNCTION_ARGS) int totalwritten; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif bytestowrite = VARSIZE(wbuf) - VARHDRSZ; totalwritten = lo_write(fd, VARDATA(wbuf), bytestowrite); @@ -435,11 +512,18 @@ lo_import(PG_FUNCTION_ARGS) text *filename = PG_GETARG_TEXT_PP(0); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif PG_RETURN_OID(lo_import_internal(filename, InvalidOid)); } @@ -455,11 +539,18 @@ lo_import_with_oid(PG_FUNCTION_ARGS) Oid oid = PG_GETARG_OID(1); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif PG_RETURN_OID(lo_import_internal(filename, oid)); } @@ -542,11 +633,18 @@ lo_export(PG_FUNCTION_ARGS) mode_t oumask; #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), 
errdetail("The feature is not currently supported"))); #endif +#endif #ifndef ALLOW_DANGEROUS_LO_FUNCTIONS if (!superuser()) @@ -611,11 +709,18 @@ lo_truncate(PG_FUNCTION_ARGS) int32 len = PG_GETARG_INT32(1); #ifdef PGXC +#ifdef XCP + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Postgres-XL does not yet support large objects"), + errdetail("The feature is not currently supported"))); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Postgres-XC does not support large object yet"), errdetail("The feature is not currently supported"))); #endif +#endif if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL) ereport(ERROR, diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 828f6dcc8e..7a84dc24d5 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -5,6 +5,11 @@ * wherein you authenticate a user by seeing what IP address the system * says he comes from and choosing authentication method based on it). * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -2092,3 +2097,91 @@ hba_getauthmethod(hbaPort *port) { check_hba(port); } + +#ifdef XCP +/* + * NB the only way to free allocated lines is to reset or delete current memory + * context, so caller is responsible for setting it up properly to avoid leak. + * However, if function fails it would release working memory. + * Basically the function does the same as load_hba(), but it does not set + * the static variables. 
+ */ +List* get_parsed_hba(void) { + FILE *file; + List *hba_lines = NIL; + List *hba_line_nums = NIL; + ListCell *line, + *line_num; + List *new_parsed_lines = NIL; + bool ok = true; + MemoryContext linecxt; + MemoryContext oldcxt; + MemoryContext hbacxt; + + file = AllocateFile(HbaFileName, "r"); + if (file == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open configuration file \"%s\": %m", + HbaFileName))); + + /* + * Caller will take care of making this a FATAL error in case this is + * the initial startup. If it happens on reload, we just keep the old + * version around. + */ + return false; + } + + linecxt = tokenize_file(HbaFileName, file, &hba_lines, &hba_line_nums); + FreeFile(file); + + /* Now parse all the lines */ + hbacxt = AllocSetContextCreate(CurrentMemoryContext, + "hba parser context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcxt = MemoryContextSwitchTo(hbacxt); + forboth(line, hba_lines, line_num, hba_line_nums) + { + HbaLine *newline; + + if ((newline = parse_hba_line(lfirst(line), lfirst_int(line_num))) == NULL) + { + /* + * Parse error in the file, so indicate there's a problem. NB: a + * problem in a line will free the memory for all previous lines as + * well! + */ + MemoryContextReset(hbacxt); + new_parsed_lines = NIL; + ok = false; + + /* + * Keep parsing the rest of the file so we can report errors on + * more than the first row. Error has already been reported in the + * parsing function, so no need to log it here. 
+ */ + continue; + } + + new_parsed_lines = lappend(new_parsed_lines, newline); + } + + /* Free tokenizer memory */ + MemoryContextDelete(linecxt); + MemoryContextSwitchTo(oldcxt); + + if (!ok) + { + /* Parsing failed at one or more rows, so bail out */ + MemoryContextDelete(hbacxt); + return NIL; + } + + /* Loaded new file successfully, return */ + return parsed_hba_lines; +} +#endif diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 483a956434..65091479a4 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -328,6 +328,7 @@ help(const char *progname) printf(_("\nNode options:\n")); printf(_(" --coordinator start as a Coordinator\n")); printf(_(" --datanode start as a Datanode\n")); + printf(_(" --restoremode start to restore existing schema on the new node to be added\n")); #endif printf(_("\nPlease read the documentation for the complete list of run-time\n" diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 7eeb5b9af8..46c9940bd7 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -11,6 +11,11 @@ * be handled easily in a simple depth-first traversal. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -28,7 +33,10 @@ #include "nodes/relation.h" #ifdef PGXC #include "pgxc/locator.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" +#endif +#ifdef XCP +#include "pgxc/execRemote.h" #endif #include "utils/datum.h" @@ -98,7 +106,16 @@ _copyPlannedStmt(const PlannedStmt *from) COPY_NODE_FIELD(relationOids); COPY_NODE_FIELD(invalItems); COPY_SCALAR_FIELD(nParamExec); - +#ifdef XCP + COPY_SCALAR_FIELD(nParamRemote); + COPY_POINTER_FIELD(remoteparams, + newnode->nParamRemote * sizeof(RemoteParam)); + COPY_STRING_FIELD(pname); + COPY_SCALAR_FIELD(distributionType); + COPY_SCALAR_FIELD(distributionKey); + COPY_NODE_FIELD(distributionNodes); + COPY_NODE_FIELD(distributionRestrict); +#endif return newnode; } @@ -187,8 +204,10 @@ _copyModifyTable(const ModifyTable *from) COPY_NODE_FIELD(rowMarks); COPY_SCALAR_FIELD(epqParam); #ifdef PGXC +#ifndef XCP COPY_NODE_FIELD(remote_plans); #endif +#endif return newnode; } @@ -781,6 +800,9 @@ _copyAgg(const Agg *from) CopyPlanFields((const Plan *) from, (Plan *) newnode); COPY_SCALAR_FIELD(aggstrategy); +#ifdef XCP + COPY_SCALAR_FIELD(aggdistribution); +#endif COPY_SCALAR_FIELD(numCols); if (from->numCols > 0) { @@ -1023,27 +1045,32 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_STRING_FIELD(sql_statement); COPY_NODE_FIELD(exec_nodes); COPY_SCALAR_FIELD(combine_type); + COPY_NODE_FIELD(sort); COPY_SCALAR_FIELD(read_only); COPY_SCALAR_FIELD(force_autocommit); COPY_STRING_FIELD(statement); COPY_STRING_FIELD(cursor); - COPY_SCALAR_FIELD(rq_num_params); - if (from->rq_param_types) - COPY_POINTER_FIELD(rq_param_types, - sizeof(from->rq_param_types[0]) * from->rq_num_params); - else - newnode->rq_param_types = NULL; + COPY_SCALAR_FIELD(remote_num_params); + COPY_POINTER_FIELD(remote_param_types, + 
sizeof(from->remote_param_types[0]) * from->remote_num_params); COPY_SCALAR_FIELD(exec_type); +#ifndef XCP COPY_SCALAR_FIELD(is_temp); - COPY_SCALAR_FIELD(rq_finalise_aggs); - COPY_SCALAR_FIELD(rq_sortgroup_colno); - COPY_NODE_FIELD(remote_query); +#endif + + COPY_SCALAR_FIELD(reduce_level); COPY_NODE_FIELD(base_tlist); - COPY_NODE_FIELD(coord_var_tlist); - COPY_NODE_FIELD(query_var_tlist); + COPY_STRING_FIELD(outer_alias); + COPY_STRING_FIELD(inner_alias); + COPY_SCALAR_FIELD(outer_reduce_level); + COPY_SCALAR_FIELD(inner_reduce_level); + COPY_BITMAPSET_FIELD(outer_relids); + COPY_BITMAPSET_FIELD(inner_relids); + COPY_STRING_FIELD(inner_statement); + COPY_STRING_FIELD(outer_statement); + COPY_STRING_FIELD(join_condition); COPY_SCALAR_FIELD(has_row_marks); COPY_SCALAR_FIELD(has_ins_child_sel_parent); - COPY_SCALAR_FIELD(rq_params_internal); return newnode; } @@ -1079,6 +1106,7 @@ _copySimpleSort(const SimpleSort *from) { COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(sortCollations, from->numCols * sizeof(Oid)); COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); } @@ -1086,6 +1114,55 @@ _copySimpleSort(const SimpleSort *from) } #endif + +#ifdef XCP +/* + * _copyRemoteSubplan + */ +static RemoteSubplan * +_copyRemoteSubplan(const RemoteSubplan *from) +{ + RemoteSubplan *newnode = makeNode(RemoteSubplan); + + /* + * copy node superclass fields + */ + CopyScanFields((Scan *) from, (Scan *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(distributionType); + COPY_SCALAR_FIELD(distributionKey); + COPY_NODE_FIELD(distributionNodes); + COPY_NODE_FIELD(distributionRestrict); + COPY_NODE_FIELD(nodeList); + COPY_SCALAR_FIELD(execOnAll); + COPY_NODE_FIELD(sort); + COPY_STRING_FIELD(cursor); + COPY_SCALAR_FIELD(unique); + + return newnode; +} + +/* + * _copyDistribution + */ +static Distribution * +_copyDistribution(const Distribution *from) 
+{ + Distribution *newnode = makeNode(Distribution); + + COPY_SCALAR_FIELD(distributionType); + COPY_NODE_FIELD(distributionExpr); + COPY_BITMAPSET_FIELD(nodes); + COPY_BITMAPSET_FIELD(restrictNodes); + + return newnode; +} +#endif + + /* **************************************************************** * primnodes.h copy functions * **************************************************************** @@ -1241,8 +1318,10 @@ _copyAggref(const Aggref *from) COPY_SCALAR_FIELD(aggfnoid); COPY_SCALAR_FIELD(aggtype); #ifdef PGXC +#ifndef XCP COPY_SCALAR_FIELD(aggtrantype); COPY_SCALAR_FIELD(agghas_collectfn); +#endif /* XCP */ #endif /* PGXC */ COPY_SCALAR_FIELD(aggcollid); COPY_SCALAR_FIELD(inputcollid); @@ -2069,8 +2148,10 @@ _copyRangeTblEntry(const RangeTblEntry *from) COPY_SCALAR_FIELD(rtekind); #ifdef PGXC +#ifndef XCP COPY_STRING_FIELD(relname); #endif +#endif COPY_SCALAR_FIELD(relid); COPY_SCALAR_FIELD(relkind); @@ -2554,9 +2635,11 @@ _copyQuery(const Query *from) COPY_NODE_FIELD(setOperations); COPY_NODE_FIELD(constraintDeps); #ifdef PGXC +#ifndef XCP COPY_STRING_FIELD(sql_statement); COPY_SCALAR_FIELD(is_ins_child_sel_parent); #endif +#endif return newnode; } @@ -3926,6 +4009,17 @@ _copyBarrierStmt(const BarrierStmt *from) return newnode; } +#ifdef XCP +static PauseClusterStmt * +_copyPauseClusterStmt(const PauseClusterStmt *from) +{ + PauseClusterStmt *newnode = makeNode(PauseClusterStmt); + + COPY_SCALAR_FIELD(pause); + + return newnode; +} +#endif /* **************************************************************** * nodemgr.h copy functions * **************************************************************** @@ -4162,6 +4256,14 @@ copyObject(const void *from) retval = _copySimpleSort(from); break; #endif +#ifdef XCP + case T_RemoteSubplan: + retval = _copyRemoteSubplan(from); + break; + case T_Distribution: + retval = _copyDistribution(from); + break; +#endif /* * PRIMITIVE NODES */ @@ -4609,6 +4711,11 @@ copyObject(const void *from) case T_BarrierStmt: retval = 
_copyBarrierStmt(from); break; +#ifdef XCP + case T_PauseClusterStmt: + retval = _copyPauseClusterStmt(from); + break; +#endif case T_AlterNodeStmt: retval = _copyAlterNodeStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 34885297bf..6817fe73a1 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -18,6 +18,11 @@ * "x" to be considered equal() to another reference to "x" in the query. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -186,11 +191,17 @@ static bool _equalAggref(const Aggref *a, const Aggref *b) { COMPARE_SCALAR_FIELD(aggfnoid); +#ifndef XCP + /* + * In XCP ignore aggtype difference because Phase 1 of aggregate have + * aggtype set to aggtrantype + */ COMPARE_SCALAR_FIELD(aggtype); #ifdef PGXC COMPARE_SCALAR_FIELD(aggtrantype); COMPARE_SCALAR_FIELD(agghas_collectfn); #endif /* PGXC */ +#endif /* XCP */ COMPARE_SCALAR_FIELD(aggcollid); COMPARE_SCALAR_FIELD(inputcollid); COMPARE_NODE_FIELD(args); @@ -931,8 +942,10 @@ _equalQuery(const Query *a, const Query *b) COMPARE_NODE_FIELD(constraintDeps); #ifdef PGXC +#ifndef XCP COMPARE_SCALAR_FIELD(is_ins_child_sel_parent); #endif +#endif return true; } @@ -2366,6 +2379,18 @@ _equalXmlSerialize(const XmlSerialize *a, const XmlSerialize *b) return true; } +#ifdef XCP +static bool +_equalDistribution(Distribution *a, Distribution *b) +{ + COMPARE_SCALAR_FIELD(distributionType); + COMPARE_NODE_FIELD(distributionExpr); + COMPARE_BITMAPSET_FIELD(nodes); + + return true; +} +#endif + /* * 
Stuff from pg_list.h */ @@ -2467,6 +2492,17 @@ _equalBarrierStmt(const BarrierStmt *a, const BarrierStmt *b) return true; } +#ifdef XCP +/* + * Lock Cluster stuff + */ +static bool +_equalPauseClusterStmt(PauseClusterStmt *a, PauseClusterStmt *b) +{ + COMPARE_SCALAR_FIELD(pause); + return true; +} +#endif /* * stuff from nodemgr.h */ @@ -2989,6 +3025,11 @@ equal(const void *a, const void *b) case T_BarrierStmt: retval = _equalBarrierStmt(a, b); break; +#ifdef XCP + case T_PauseClusterStmt: + retval = _equalPauseClusterStmt(a, b); + break; +#endif case T_AlterNodeStmt: retval = _equalAlterNodeStmt(a, b); break; @@ -3135,6 +3176,11 @@ equal(const void *a, const void *b) case T_XmlSerialize: retval = _equalXmlSerialize(a, b); break; +#ifdef XCP + case T_Distribution: + retval = _equalDistribution(a, b); + break; +#endif default: elog(ERROR, "unrecognized node type: %d", diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 080047c8e7..efb751176f 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -3,6 +3,11 @@ * outfuncs.c * Output functions for Postgres tree nodes. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,9 +31,35 @@ #include "lib/stringinfo.h" #include "nodes/plannodes.h" #include "nodes/relation.h" +#ifdef XCP +#include "fmgr.h" +#include "miscadmin.h" +#include "catalog/namespace.h" +#include "pgxc/execRemote.h" +#include "utils/lsyscache.h" +#endif #include "utils/datum.h" #ifdef PGXC -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" +#endif + +#ifdef XCP +/* + * When we sending query plans between nodes we need to send OIDs of various + * objects - relations, data types, functions, etc. + * On different nodes OIDs of these objects may differ, so we need to send an + * identifier, depending on object type, allowing to lookup OID on target node. + * On the other hand we want to save space when storing rules, or in other cases + * when we need to encode and decode nodes on the same node. + * For now default format is not portable, as it is in original Postgres code. 
+ * Later we may want to add extra parameter in nodeToString() function + */ +static bool portable_output = false; +void +set_portable_output(bool value) +{ + portable_output = value; +} #endif @@ -51,9 +82,16 @@ #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +#ifdef XCP +/* Only allow output OIDs in not portable mode */ +#define WRITE_OID_FIELD(fldname) \ + (AssertMacro(!portable_output), \ + appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)) +#else /* Write an OID field (don't hard-wire assumption that OID is same as uint) */ #define WRITE_OID_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +#endif /* Write a long-integer field */ #define WRITE_LONG_FIELD(fldname) \ @@ -96,6 +134,98 @@ (appendStringInfo(str, " :" CppAsString(fldname) " "), \ _outBitmapset(str, node->fldname)) +#ifdef XCP +#define NSP_NAME(oid) \ + isTempNamespace(oid) ? "pg_temp" : get_namespace_name(oid) +/* + * Macros to encode OIDs to send to other nodes. Objects on other nodes may have + * different OIDs, so send instead an unique identifier allowing to lookup + * the OID on target node. The identifier depends on object type. + */ + +/* write an OID which is a relation OID */ +#define WRITE_RELID_FIELD(fldname) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), \ + _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_rel_namespace(node->fldname)) : NULL), \ + appendStringInfoChar(str, ' '), \ + _outToken(str, OidIsValid(node->fldname) ? get_rel_name(node->fldname) : NULL)) + +/* write an OID which is a data type OID */ +#define WRITE_TYPID_FIELD(fldname) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), \ + _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \ + appendStringInfoChar(str, ' '), \ + _outToken(str, OidIsValid(node->fldname) ? 
get_typ_name(node->fldname) : NULL)) + +/* write an OID which is a function OID */ +#define WRITE_FUNCID_FIELD(fldname) \ + do { \ + appendStringInfo(str, " :" CppAsString(fldname) " "); \ + if (OidIsValid(node->fldname)) \ + { \ + Oid *argtypes; \ + int i, nargs; \ + _outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, get_func_name(node->fldname)); \ + appendStringInfoChar(str, ' '); \ + get_func_signature(node->fldname, &argtypes, &nargs); \ + appendStringInfo(str, "%d", nargs); \ + for (i = 0; i < nargs; i++) \ + { \ + appendStringInfoChar(str, ' '); \ + _outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, get_typ_name(argtypes[i])); \ + } \ + } \ + else \ + appendStringInfo(str, "<> <> 0"); \ + } while (0) + +/* write an OID which is an operator OID */ +#define WRITE_OPERID_FIELD(fldname) \ + do { \ + appendStringInfo(str, " :" CppAsString(fldname) " "); \ + if (OidIsValid(node->fldname)) \ + { \ + Oid oprleft, oprright; \ + _outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, get_opname(node->fldname)); \ + appendStringInfoChar(str, ' '); \ + op_input_types(node->fldname, &oprleft, &oprright); \ + _outToken(str, OidIsValid(oprleft) ? \ + NSP_NAME(get_typ_namespace(oprleft)) : NULL); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, OidIsValid(oprright) ? \ + NSP_NAME(get_typ_namespace(oprright)) : NULL); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); \ + appendStringInfoChar(str, ' '); \ + } \ + else \ + appendStringInfo(str, "<> <> <> <> <> <>"); \ + } while (0) + +/* write an OID which is a collation OID */ +#define WRITE_COLLID_FIELD(fldname) \ + do { \ + appendStringInfo(str, " :" CppAsString(fldname) " "); \ + if (OidIsValid(node->fldname)) \ + { \ + _outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \ + appendStringInfoChar(str, ' '); \ + _outToken(str, get_collation_name(node->fldname)); \ + appendStringInfo(str, " %d", get_collation_encoding(node->fldname)); \ + } \ + else \ + appendStringInfo(str, "<> <> -1"); \ + } while (0) + +#endif #define booltostr(x) ((x) ? "true" : "false") @@ -235,6 +365,48 @@ _outDatum(StringInfo str, Datum value, int typlen, bool typbyval) } +#ifdef XCP +/* + * Output value in text format + */ +static void +_printDatum(StringInfo str, Datum value, Oid typid) +{ + Oid typOutput; + bool typIsVarlena; + FmgrInfo finfo; + Datum tmpval; + char *textvalue; + int saveDateStyle; + + /* Get output function for the type */ + getTypeOutputInfo(typid, &typOutput, &typIsVarlena); + fmgr_info(typOutput, &finfo); + + /* Detoast value if needed */ + if (typIsVarlena) + tmpval = PointerGetDatum(PG_DETOAST_DATUM(value)); + else + tmpval = value; + + /* + * It was found that if configuration setting for date style is + * "postgres,ymd" the output dates have format DD-MM-YYYY and they can not + * be parsed correctly by receiving party. So force ISO format YYYY-MM-DD + * in internal cluster communications, these values are always parsed + * correctly. 
+ */ + saveDateStyle = DateStyle; + DateStyle = USE_ISO_DATES; + + textvalue = DatumGetCString(FunctionCall1(&finfo, tmpval)); + _outToken(str, textvalue); + + DateStyle = saveDateStyle; +} +#endif + + /* * Stuff from plannodes.h */ @@ -339,8 +511,10 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_NODE_FIELD(rowMarks); WRITE_INT_FIELD(epqParam); #ifdef PGXC +#ifndef XCP WRITE_NODE_FIELD(remote_plans); #endif +#endif } static void @@ -372,10 +546,52 @@ _outMergeAppend(StringInfo str, const MergeAppend *node) appendStringInfo(str, " :sortOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->sortOperators[i]; + Oid oprleft, oprright; + /* Sort operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + } + else +#endif appendStringInfo(str, " %u", node->sortOperators[i]); appendStringInfo(str, " :collations"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid coll = node->collations[i]; + if (OidIsValid(coll)) + { + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_collation_namespace(coll))); + appendStringInfoChar(str, ' '); + _outToken(str, get_collation_name(coll)); + appendStringInfo(str, " %d", get_collation_encoding(coll)); + } + else + appendStringInfo(str, " <> <> -1"); + } + else +#endif appendStringInfo(str, " %u", node->collations[i]); appendStringInfo(str, " :nullsFirst"); @@ -401,6 +617,32 @@ _outRecursiveUnion(StringInfo str, const RecursiveUnion *node) appendStringInfo(str, " :dupOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->dupOperators[i]; + Oid oprleft, oprright; + /* Unique operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->dupOperators[i]); WRITE_LONG_FIELD(numGroups); @@ -449,6 +691,11 @@ _outIndexScan(StringInfo str, const IndexScan *node) _outScanInfo(str, (const Scan *) node); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(indexid); + else +#endif WRITE_OID_FIELD(indexid); WRITE_NODE_FIELD(indexqual); WRITE_NODE_FIELD(indexqualorig); @@ -475,22 +722,18 @@ _outRemoteQuery(StringInfo str, const RemoteQuery *node) WRITE_BOOL_FIELD(force_autocommit); WRITE_STRING_FIELD(statement); WRITE_STRING_FIELD(cursor); - WRITE_INT_FIELD(rq_num_params); + WRITE_INT_FIELD(remote_num_params); - appendStringInfo(str, " :rq_param_types"); - for (i = 0; i < node->rq_num_params; i++) - appendStringInfo(str, " %d", node->rq_param_types[i]); + appendStringInfo(str, " :remote_param_types"); + for (i = 0; i < node->remote_num_params; i++) + appendStringInfo(str, " %d", node->remote_param_types[i]); WRITE_ENUM_FIELD(exec_type, RemoteQueryExecType); +#ifndef XCP WRITE_BOOL_FIELD(is_temp); +#endif WRITE_BOOL_FIELD(has_row_marks); - WRITE_BOOL_FIELD(rq_finalise_aggs); - WRITE_BOOL_FIELD(rq_sortgroup_colno); - WRITE_NODE_FIELD(remote_query); - WRITE_NODE_FIELD(coord_var_tlist); - WRITE_NODE_FIELD(query_var_tlist); WRITE_BOOL_FIELD(has_ins_child_sel_parent); - WRITE_BOOL_FIELD(rq_params_internal); } static void @@ -514,6 +757,11 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node) _outScanInfo(str, (const Scan *) node); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(indexid); + else +#endif WRITE_OID_FIELD(indexid); WRITE_NODE_FIELD(indexqual); WRITE_NODE_FIELD(indexorderby); @@ -528,6 +776,11 @@ _outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node) _outScanInfo(str, (const Scan *) node); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(indexid); + else +#endif WRITE_OID_FIELD(indexid); WRITE_NODE_FIELD(indexqual); WRITE_NODE_FIELD(indexqualorig); @@ 
-658,6 +911,23 @@ _outMergeJoin(StringInfo str, const MergeJoin *node) appendStringInfo(str, " :mergeCollations"); for (i = 0; i < numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid coll = node->mergeCollations[i]; + if (OidIsValid(coll)) + { + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_collation_namespace(coll))); + appendStringInfoChar(str, ' '); + _outToken(str, get_collation_name(coll)); + appendStringInfo(str, " %d", get_collation_encoding(coll)); + } + else + appendStringInfo(str, " <> <> -1"); + } + else +#endif appendStringInfo(str, " %u", node->mergeCollations[i]); appendStringInfo(str, " :mergeStrategies"); @@ -689,6 +959,9 @@ _outAgg(StringInfo str, const Agg *node) _outPlanInfo(str, (const Plan *) node); WRITE_ENUM_FIELD(aggstrategy, AggStrategy); +#ifdef XCP + WRITE_ENUM_FIELD(aggdistribution, AggDistribution); +#endif WRITE_INT_FIELD(numCols); appendStringInfo(str, " :grpColIdx"); @@ -697,6 +970,32 @@ _outAgg(StringInfo str, const Agg *node) appendStringInfo(str, " :grpOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->grpOperators[i]; + Oid oprleft, oprright; + /* Group operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->grpOperators[i]); WRITE_LONG_FIELD(numGroups); @@ -720,6 +1019,32 @@ _outWindowAgg(StringInfo str, const WindowAgg *node) appendStringInfo(str, " :partOperations"); for (i = 0; i < node->partNumCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->partOperators[i]; + Oid oprleft, oprright; + /* The operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->partOperators[i]); WRITE_INT_FIELD(ordNumCols); @@ -730,6 +1055,32 @@ _outWindowAgg(StringInfo str, const WindowAgg *node) appendStringInfo(str, " :ordOperations"); for (i = 0; i < node->ordNumCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->ordOperators[i]; + Oid oprleft, oprright; + /* Group operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? 
get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->ordOperators[i]); WRITE_INT_FIELD(frameOptions); @@ -754,6 +1105,32 @@ _outGroup(StringInfo str, const Group *node) appendStringInfo(str, " :grpOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->grpOperators[i]; + Oid oprleft, oprright; + /* Group operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->grpOperators[i]); } @@ -782,10 +1159,52 @@ _outSort(StringInfo str, const Sort *node) appendStringInfo(str, " :sortOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->sortOperators[i]; + Oid oprleft, oprright; + /* Sort operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + } + else +#endif appendStringInfo(str, " %u", node->sortOperators[i]); appendStringInfo(str, " :collations"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid coll = node->collations[i]; + if (OidIsValid(coll)) + { + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_collation_namespace(coll))); + appendStringInfoChar(str, ' '); + _outToken(str, get_collation_name(coll)); + appendStringInfo(str, " %d", get_collation_encoding(coll)); + } + else + appendStringInfo(str, " <> <> -1"); + } + else +#endif appendStringInfo(str, " %u", node->collations[i]); appendStringInfo(str, " :nullsFirst"); @@ -810,6 +1229,32 @@ _outUnique(StringInfo str, const Unique *node) appendStringInfo(str, " :uniqOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->uniqOperators[i]; + Oid oprleft, oprright; + /* Unique operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->uniqOperators[i]); } @@ -820,9 +1265,19 @@ _outHash(StringInfo str, const Hash *node) _outPlanInfo(str, (const Plan *) node); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(skewTable); + else +#endif WRITE_OID_FIELD(skewTable); WRITE_INT_FIELD(skewColumn); WRITE_BOOL_FIELD(skewInherit); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(skewColType); + else +#endif WRITE_OID_FIELD(skewColType); WRITE_INT_FIELD(skewColTypmod); } @@ -846,6 +1301,32 @@ _outSetOp(StringInfo str, const SetOp *node) appendStringInfo(str, " :dupOperators"); for (i = 0; i < node->numCols; i++) +#ifdef XCP + if (portable_output) + { + Oid oper = node->dupOperators[i]; + Oid oprleft, oprright; + /* Unique operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + appendStringInfoChar(str, ' '); + } + else +#endif appendStringInfo(str, " %u", node->dupOperators[i]); WRITE_INT_FIELD(flagColIdx); @@ -875,6 +1356,135 @@ _outLimit(StringInfo str, const Limit *node) WRITE_NODE_FIELD(limitCount); } +#ifdef XCP +static void +_outRemoteSubplan(StringInfo str, const RemoteSubplan *node) +{ + WRITE_NODE_TYPE("REMOTESUBPLAN"); + + _outScanInfo(str, (Scan *) node); + + WRITE_CHAR_FIELD(distributionType); + WRITE_INT_FIELD(distributionKey); + WRITE_NODE_FIELD(distributionNodes); + WRITE_NODE_FIELD(distributionRestrict); + WRITE_NODE_FIELD(nodeList); + WRITE_BOOL_FIELD(execOnAll); + WRITE_NODE_FIELD(sort); + WRITE_STRING_FIELD(cursor); + WRITE_INT_FIELD(unique); +} + +static void +_outRemoteStmt(StringInfo str, const RemoteStmt *node) +{ + int i; + + WRITE_NODE_TYPE("REMOTESTMT"); + + WRITE_ENUM_FIELD(commandType, CmdType); + WRITE_BOOL_FIELD(hasReturning); + WRITE_NODE_FIELD(planTree); + WRITE_NODE_FIELD(rtable); + WRITE_NODE_FIELD(resultRelations); + WRITE_NODE_FIELD(subplans); + WRITE_INT_FIELD(nParamExec); + WRITE_INT_FIELD(nParamRemote); + + for (i = 0; i < node->nParamRemote; i++) + { + RemoteParam *rparam = &(node->remoteparams[i]); + appendStringInfo(str, " :paramkind"); + appendStringInfo(str, " %d", (int) rparam->paramkind); + + appendStringInfo(str, " :paramid"); + appendStringInfo(str, " %d", rparam->paramid); + + appendStringInfo(str, " :paramtype"); + if (portable_output) + { + Oid ptype = rparam->paramtype; + Assert(OidIsValid(ptype)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_typ_namespace(ptype))); + appendStringInfoChar(str, ' '); + _outToken(str, get_typ_name(ptype)); + } + else + appendStringInfo(str, " %u", rparam->paramtype); + } + WRITE_NODE_FIELD(rowMarks); + WRITE_CHAR_FIELD(distributionType); + WRITE_INT_FIELD(distributionKey); + WRITE_NODE_FIELD(distributionNodes); + WRITE_NODE_FIELD(distributionRestrict); +} + +static void +_outSimpleSort(StringInfo str, 
const SimpleSort *node) +{ + int i; + + WRITE_NODE_TYPE("SIMPLESORT"); + + WRITE_INT_FIELD(numCols); + + appendStringInfo(str, " :sortColIdx"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %d", node->sortColIdx[i]); + + appendStringInfo(str, " :sortOperators"); + for (i = 0; i < node->numCols; i++) + if (portable_output) + { + Oid oper = node->sortOperators[i]; + Oid oprleft, oprright; + /* Sort operator is always valid */ + Assert(OidIsValid(oper)); + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_opnamespace(oper))); + appendStringInfoChar(str, ' '); + _outToken(str, get_opname(oper)); + appendStringInfoChar(str, ' '); + op_input_types(oper, &oprleft, &oprright); + _outToken(str, OidIsValid(oprleft) ? + NSP_NAME(get_typ_namespace(oprleft)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? + NSP_NAME(get_typ_namespace(oprright)) : NULL); + appendStringInfoChar(str, ' '); + _outToken(str, OidIsValid(oprright) ? 
get_typ_name(oprright) : NULL); + } + else + appendStringInfo(str, " %u", node->sortOperators[i]); + + appendStringInfo(str, " :sortCollations"); + for (i = 0; i < node->numCols; i++) + if (portable_output) + { + Oid coll = node->sortCollations[i]; + if (OidIsValid(coll)) + { + appendStringInfoChar(str, ' '); + _outToken(str, NSP_NAME(get_collation_namespace(coll))); + appendStringInfoChar(str, ' '); + _outToken(str, get_collation_name(coll)); + appendStringInfo(str, " %d", get_collation_encoding(coll)); + } + else + appendStringInfo(str, " <> <> -1"); + } + else + appendStringInfo(str, " %u", node->sortCollations[i]); + + appendStringInfo(str, " :nullsFirst"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %s", booltostr(node->nullsFirst[i])); +} +#endif + static void _outNestLoopParam(StringInfo str, const NestLoopParam *node) { @@ -958,8 +1568,18 @@ _outVar(StringInfo str, const Var *node) WRITE_UINT_FIELD(varno); WRITE_INT_FIELD(varattno); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(vartype); + else +#endif WRITE_OID_FIELD(vartype); WRITE_INT_FIELD(vartypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(varcollid); + else +#endif WRITE_OID_FIELD(varcollid); WRITE_UINT_FIELD(varlevelsup); WRITE_UINT_FIELD(varnoold); @@ -972,8 +1592,18 @@ _outConst(StringInfo str, const Const *node) { WRITE_NODE_TYPE("CONST"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(consttype); + else +#endif WRITE_OID_FIELD(consttype); WRITE_INT_FIELD(consttypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(constcollid); + else +#endif WRITE_OID_FIELD(constcollid); WRITE_INT_FIELD(constlen); WRITE_BOOL_FIELD(constbyval); @@ -984,6 +1614,11 @@ _outConst(StringInfo str, const Const *node) if (node->constisnull) appendStringInfo(str, "<>"); else +#ifdef XCP + if (portable_output) + _printDatum(str, node->constvalue, node->consttype); + else +#endif _outDatum(str, node->constvalue, node->constlen, node->constbyval); } @@ -994,8 
+1629,18 @@ _outParam(StringInfo str, const Param *node) WRITE_ENUM_FIELD(paramkind, ParamKind); WRITE_INT_FIELD(paramid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(paramtype); + else +#endif WRITE_OID_FIELD(paramtype); WRITE_INT_FIELD(paramtypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(paramcollid); + else +#endif WRITE_OID_FIELD(paramcollid); WRITE_LOCATION_FIELD(location); } @@ -1005,13 +1650,35 @@ _outAggref(StringInfo str, const Aggref *node) { WRITE_NODE_TYPE("AGGREF"); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(aggfnoid); + else +#endif WRITE_OID_FIELD(aggfnoid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(aggtype); + else +#endif WRITE_OID_FIELD(aggtype); #ifdef PGXC +#ifndef XCP WRITE_OID_FIELD(aggtrantype); WRITE_BOOL_FIELD(agghas_collectfn); +#endif /* XCP */ #endif /* PGXC */ +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(aggcollid); + else +#endif WRITE_OID_FIELD(aggcollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_NODE_FIELD(aggorder); @@ -1026,9 +1693,29 @@ _outWindowFunc(StringInfo str, const WindowFunc *node) { WRITE_NODE_TYPE("WINDOWFUNC"); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(winfnoid); + else +#endif WRITE_OID_FIELD(winfnoid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(wintype); + else +#endif WRITE_OID_FIELD(wintype); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(wincollid); + else +#endif WRITE_OID_FIELD(wincollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_UINT_FIELD(winref); @@ -1042,9 +1729,24 @@ _outArrayRef(StringInfo str, const ArrayRef *node) { WRITE_NODE_TYPE("ARRAYREF"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(refarraytype); + else +#endif WRITE_OID_FIELD(refarraytype); +#ifdef XCP + if (portable_output) + 
WRITE_TYPID_FIELD(refelemtype); + else +#endif WRITE_OID_FIELD(refelemtype); WRITE_INT_FIELD(reftypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(refcollid); + else +#endif WRITE_OID_FIELD(refcollid); WRITE_NODE_FIELD(refupperindexpr); WRITE_NODE_FIELD(reflowerindexpr); @@ -1057,11 +1759,31 @@ _outFuncExpr(StringInfo str, const FuncExpr *node) { WRITE_NODE_TYPE("FUNCEXPR"); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(funcid); + else +#endif WRITE_OID_FIELD(funcid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(funcresulttype); + else +#endif WRITE_OID_FIELD(funcresulttype); WRITE_BOOL_FIELD(funcretset); WRITE_ENUM_FIELD(funcformat, CoercionForm); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(funccollid); + else +#endif WRITE_OID_FIELD(funccollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1083,11 +1805,36 @@ _outOpExpr(StringInfo str, const OpExpr *node) { WRITE_NODE_TYPE("OPEXPR"); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(opno); + else +#endif WRITE_OID_FIELD(opno); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(opfuncid); + else +#endif WRITE_OID_FIELD(opfuncid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(opresulttype); + else +#endif WRITE_OID_FIELD(opresulttype); WRITE_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(opcollid); + else +#endif WRITE_OID_FIELD(opcollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1098,11 +1845,36 @@ _outDistinctExpr(StringInfo str, const DistinctExpr *node) { WRITE_NODE_TYPE("DISTINCTEXPR"); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(opno); + else +#endif WRITE_OID_FIELD(opno); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(opfuncid); + 
else +#endif WRITE_OID_FIELD(opfuncid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(opresulttype); + else +#endif WRITE_OID_FIELD(opresulttype); WRITE_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(opcollid); + else +#endif WRITE_OID_FIELD(opcollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1113,11 +1885,36 @@ _outNullIfExpr(StringInfo str, const NullIfExpr *node) { WRITE_NODE_TYPE("NULLIFEXPR"); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(opno); + else +#endif WRITE_OID_FIELD(opno); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(opfuncid); + else +#endif WRITE_OID_FIELD(opfuncid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(opresulttype); + else +#endif WRITE_OID_FIELD(opresulttype); WRITE_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(opcollid); + else +#endif WRITE_OID_FIELD(opcollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1128,9 +1925,24 @@ _outScalarArrayOpExpr(StringInfo str, const ScalarArrayOpExpr *node) { WRITE_NODE_TYPE("SCALARARRAYOPEXPR"); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(opno); + else +#endif WRITE_OID_FIELD(opno); +#ifdef XCP + if (portable_output) + WRITE_FUNCID_FIELD(opfuncid); + else +#endif WRITE_OID_FIELD(opfuncid); WRITE_BOOL_FIELD(useOr); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1185,8 +1997,18 @@ _outSubPlan(StringInfo str, const SubPlan *node) WRITE_NODE_FIELD(paramIds); WRITE_INT_FIELD(plan_id); WRITE_STRING_FIELD(plan_name); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(firstColType); + else +#endif 
WRITE_OID_FIELD(firstColType); WRITE_INT_FIELD(firstColTypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(firstColCollation); + else +#endif WRITE_OID_FIELD(firstColCollation); WRITE_BOOL_FIELD(useHashTable); WRITE_BOOL_FIELD(unknownEqFalse); @@ -1212,8 +2034,18 @@ _outFieldSelect(StringInfo str, const FieldSelect *node) WRITE_NODE_FIELD(arg); WRITE_INT_FIELD(fieldnum); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); WRITE_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(resultcollid); + else +#endif WRITE_OID_FIELD(resultcollid); } @@ -1225,6 +2057,11 @@ _outFieldStore(StringInfo str, const FieldStore *node) WRITE_NODE_FIELD(arg); WRITE_NODE_FIELD(newvals); WRITE_NODE_FIELD(fieldnums); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); } @@ -1234,8 +2071,18 @@ _outRelabelType(StringInfo str, const RelabelType *node) WRITE_NODE_TYPE("RELABELTYPE"); WRITE_NODE_FIELD(arg); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); WRITE_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(resultcollid); + else +#endif WRITE_OID_FIELD(resultcollid); WRITE_ENUM_FIELD(relabelformat, CoercionForm); WRITE_LOCATION_FIELD(location); @@ -1247,7 +2094,17 @@ _outCoerceViaIO(StringInfo str, const CoerceViaIO *node) WRITE_NODE_TYPE("COERCEVIAIO"); WRITE_NODE_FIELD(arg); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(resultcollid); + else +#endif WRITE_OID_FIELD(resultcollid); WRITE_ENUM_FIELD(coerceformat, CoercionForm); WRITE_LOCATION_FIELD(location); @@ -1259,9 +2116,24 @@ _outArrayCoerceExpr(StringInfo str, const ArrayCoerceExpr *node) WRITE_NODE_TYPE("ARRAYCOERCEEXPR"); WRITE_NODE_FIELD(arg); +#ifdef XCP + if 
(portable_output) + WRITE_FUNCID_FIELD(elemfuncid); + else +#endif WRITE_OID_FIELD(elemfuncid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); WRITE_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(resultcollid); + else +#endif WRITE_OID_FIELD(resultcollid); WRITE_BOOL_FIELD(isExplicit); WRITE_ENUM_FIELD(coerceformat, CoercionForm); @@ -1274,6 +2146,11 @@ _outConvertRowtypeExpr(StringInfo str, const ConvertRowtypeExpr *node) WRITE_NODE_TYPE("CONVERTROWTYPEEXPR"); WRITE_NODE_FIELD(arg); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); WRITE_ENUM_FIELD(convertformat, CoercionForm); WRITE_LOCATION_FIELD(location); @@ -1294,7 +2171,17 @@ _outCaseExpr(StringInfo str, const CaseExpr *node) { WRITE_NODE_TYPE("CASE"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(casetype); + else +#endif WRITE_OID_FIELD(casetype); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(casecollid); + else +#endif WRITE_OID_FIELD(casecollid); WRITE_NODE_FIELD(arg); WRITE_NODE_FIELD(args); @@ -1317,8 +2204,18 @@ _outCaseTestExpr(StringInfo str, const CaseTestExpr *node) { WRITE_NODE_TYPE("CASETESTEXPR"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(typeId); + else +#endif WRITE_OID_FIELD(typeId); WRITE_INT_FIELD(typeMod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(collation); + else +#endif WRITE_OID_FIELD(collation); } @@ -1327,8 +2224,23 @@ _outArrayExpr(StringInfo str, const ArrayExpr *node) { WRITE_NODE_TYPE("ARRAY"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(array_typeid); + else +#endif WRITE_OID_FIELD(array_typeid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(array_collid); + else +#endif WRITE_OID_FIELD(array_collid); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(element_typeid); + else +#endif WRITE_OID_FIELD(element_typeid); WRITE_NODE_FIELD(elements); 
WRITE_BOOL_FIELD(multidims); @@ -1341,6 +2253,11 @@ _outRowExpr(StringInfo str, const RowExpr *node) WRITE_NODE_TYPE("ROW"); WRITE_NODE_FIELD(args); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(row_typeid); + else +#endif WRITE_OID_FIELD(row_typeid); WRITE_ENUM_FIELD(row_format, CoercionForm); WRITE_NODE_FIELD(colnames); @@ -1365,7 +2282,17 @@ _outCoalesceExpr(StringInfo str, const CoalesceExpr *node) { WRITE_NODE_TYPE("COALESCE"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(coalescetype); + else +#endif WRITE_OID_FIELD(coalescetype); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(coalescecollid); + else +#endif WRITE_OID_FIELD(coalescecollid); WRITE_NODE_FIELD(args); WRITE_LOCATION_FIELD(location); @@ -1376,8 +2303,23 @@ _outMinMaxExpr(StringInfo str, const MinMaxExpr *node) { WRITE_NODE_TYPE("MINMAX"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(minmaxtype); + else +#endif WRITE_OID_FIELD(minmaxtype); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(minmaxcollid); + else +#endif WRITE_OID_FIELD(minmaxcollid); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(inputcollid); + else +#endif WRITE_OID_FIELD(inputcollid); WRITE_ENUM_FIELD(op, MinMaxOp); WRITE_NODE_FIELD(args); @@ -1395,6 +2337,11 @@ _outXmlExpr(StringInfo str, const XmlExpr *node) WRITE_NODE_FIELD(arg_names); WRITE_NODE_FIELD(args); WRITE_ENUM_FIELD(xmloption, XmlOptionType); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(type); + else +#endif WRITE_OID_FIELD(type); WRITE_INT_FIELD(typmod); WRITE_LOCATION_FIELD(location); @@ -1425,8 +2372,18 @@ _outCoerceToDomain(StringInfo str, const CoerceToDomain *node) WRITE_NODE_TYPE("COERCETODOMAIN"); WRITE_NODE_FIELD(arg); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(resulttype); + else +#endif WRITE_OID_FIELD(resulttype); WRITE_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(resultcollid); + else +#endif WRITE_OID_FIELD(resultcollid); 
WRITE_ENUM_FIELD(coercionformat, CoercionForm); WRITE_LOCATION_FIELD(location); @@ -1437,8 +2394,18 @@ _outCoerceToDomainValue(StringInfo str, const CoerceToDomainValue *node) { WRITE_NODE_TYPE("COERCETODOMAINVALUE"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(typeId); + else +#endif WRITE_OID_FIELD(typeId); WRITE_INT_FIELD(typeMod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(collation); + else +#endif WRITE_OID_FIELD(collation); WRITE_LOCATION_FIELD(location); } @@ -1448,8 +2415,18 @@ _outSetToDefault(StringInfo str, const SetToDefault *node) { WRITE_NODE_TYPE("SETTODEFAULT"); +#ifdef XCP + if (portable_output) + WRITE_TYPID_FIELD(typeId); + else +#endif WRITE_OID_FIELD(typeId); WRITE_INT_FIELD(typeMod); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(collation); + else +#endif WRITE_OID_FIELD(collation); WRITE_LOCATION_FIELD(location); } @@ -1473,6 +2450,11 @@ _outTargetEntry(StringInfo str, const TargetEntry *node) WRITE_INT_FIELD(resno); WRITE_STRING_FIELD(resname); WRITE_UINT_FIELD(ressortgroupref); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(resorigtbl); + else +#endif WRITE_OID_FIELD(resorigtbl); WRITE_INT_FIELD(resorigcol); WRITE_BOOL_FIELD(resjunk); @@ -1777,9 +2759,11 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_BOOL_FIELD(hasPseudoConstantQuals); WRITE_BOOL_FIELD(hasRecursion); #ifdef PGXC +#ifndef XCP WRITE_INT_FIELD(rs_alias_index); WRITE_NODE_FIELD(xc_rowMarks); -#endif +#endif /* XCP */ +#endif /* PGXC */ WRITE_INT_FIELD(wt_param_id); WRITE_BITMAPSET_FIELD(curOuterRels); WRITE_NODE_FIELD(curOuterParams); @@ -1853,6 +2837,11 @@ _outEquivalenceClass(StringInfo str, const EquivalenceClass *node) WRITE_NODE_TYPE("EQUIVALENCECLASS"); WRITE_NODE_FIELD(ec_opfamilies); +#ifdef XCP + if (portable_output) + WRITE_COLLID_FIELD(ec_collation); + else +#endif WRITE_OID_FIELD(ec_collation); WRITE_NODE_FIELD(ec_members); WRITE_NODE_FIELD(ec_sources); @@ -1964,6 +2953,11 @@ _outAppendRelInfo(StringInfo 
str, const AppendRelInfo *node) WRITE_OID_FIELD(parent_reltype); WRITE_OID_FIELD(child_reltype); WRITE_NODE_FIELD(translated_vars); +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(parent_reloid); + else +#endif WRITE_OID_FIELD(parent_reloid); } @@ -2303,7 +3297,17 @@ _outSortGroupClause(StringInfo str, const SortGroupClause *node) WRITE_NODE_TYPE("SORTGROUPCLAUSE"); WRITE_UINT_FIELD(tleSortGroupRef); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(eqop); + else +#endif WRITE_OID_FIELD(eqop); +#ifdef XCP + if (portable_output) + WRITE_OPERID_FIELD(sortop); + else +#endif WRITE_OID_FIELD(sortop); WRITE_BOOL_FIELD(nulls_first); WRITE_BOOL_FIELD(hashable); @@ -2388,12 +3392,19 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node) WRITE_NODE_FIELD(eref); WRITE_ENUM_FIELD(rtekind, RTEKind); #ifdef PGXC +#ifndef XCP WRITE_STRING_FIELD(relname); #endif +#endif switch (node->rtekind) { case RTE_RELATION: +#ifdef XCP + if (portable_output) + WRITE_RELID_FIELD(relid); + else +#endif WRITE_OID_FIELD(relid); WRITE_CHAR_FIELD(relkind); break; @@ -2436,6 +3447,12 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node) WRITE_BOOL_FIELD(inh); WRITE_BOOL_FIELD(inFromCl); WRITE_UINT_FIELD(requiredPerms); +#ifdef XCP + /* no check on data node, consider it is trusted */ + if (portable_output) + appendStringInfo(str, " :checkAsUser %u", InvalidOid); + else +#endif WRITE_OID_FIELD(checkAsUser); WRITE_BITMAPSET_FIELD(selectedCols); WRITE_BITMAPSET_FIELD(modifiedCols); @@ -2889,6 +3906,17 @@ _outNode(StringInfo str, const void *obj) case T_NestLoopParam: _outNestLoopParam(str, obj); break; +#ifdef XCP + case T_RemoteSubplan: + _outRemoteSubplan(str, obj); + break; + case T_RemoteStmt: + _outRemoteStmt(str, obj); + break; + case T_SimpleSort: + _outSimpleSort(str, obj); + break; +#endif case T_PlanRowMark: _outPlanRowMark(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 91b2bf26b1..ef59ee1c4a 100644 --- 
a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3,6 +3,11 @@ * readfuncs.c * Reader functions for Postgres tree nodes. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -34,6 +39,32 @@ #ifdef PGXC #include "access/htup.h" #endif +#ifdef XCP +#include "fmgr.h" +#include "catalog/namespace.h" +#include "nodes/plannodes.h" +#include "pgxc/execRemote.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +/* + * When we sending query plans between nodes we need to send OIDs of various + * objects - relations, data types, functions, etc. + * On different nodes OIDs of these objects may differ, so we need to send an + * identifier, depending on object type, allowing to lookup OID on target node. + * On the other hand we want to save space when storing rules, or in other cases + * when we need to encode and decode nodes on the same node. + * For now default format is not portable, as it is in original Postgres code. + * Later we may want to add extra parameter in stringToNode() function + */ +static bool portable_input = false; +void +set_portable_input(bool value) +{ + portable_input = value; +} +#endif /* XCP */ /* * Macros to simplify reading of different kinds of fields. 
Use these @@ -71,11 +102,27 @@ token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atoui(token) +#ifdef XCP +/* Read a long integer field (anything written as ":fldname %ld") */ +#define READ_LONG_FIELD(fldname) \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get field value */ \ + local_node->fldname = atol(token) +#endif + /* Read an OID field (don't hard-wire assumption that OID is same as uint) */ +#ifdef XCP +#define READ_OID_FIELD(fldname) \ + (AssertMacro(!portable_input), /* only allow to read OIDs within a node */ \ + token = pg_strtok(&length), /* skip :fldname */ \ + token = pg_strtok(&length), /* get field value */ \ + local_node->fldname = atooid(token)) +#else #define READ_OID_FIELD(fldname) \ token = pg_strtok(&length); /* skip :fldname */ \ token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atooid(token) +#endif /* Read a char field (ie, one ascii character) */ #define READ_CHAR_FIELD(fldname) \ @@ -123,6 +170,203 @@ token = pg_strtok(&length); /* skip :fldname */ \ local_node->fldname = _readBitmapset() +#ifdef XCP +/* Read fields of a Plan node */ +#define READ_PLAN_FIELDS(nodeTypeName) \ + Plan *plan_node; \ + READ_LOCALS(nodeTypeName); \ + plan_node = (Plan *) local_node; \ + token = pg_strtok(&length); /* skip :startup_cost */ \ + token = pg_strtok(&length); /* get field value */ \ + plan_node->startup_cost = atof(token); \ + token = pg_strtok(&length); /* skip :total_cost */ \ + token = pg_strtok(&length); /* get field value */ \ + plan_node->total_cost = atof(token); \ + token = pg_strtok(&length); /* skip :plan_rows */ \ + token = pg_strtok(&length); /* get field value */ \ + plan_node->plan_rows = atof(token); \ + token = pg_strtok(&length); /* skip :plan_width */ \ + token = pg_strtok(&length); /* get field value */ \ + plan_node->plan_width = atoi(token); \ + token = pg_strtok(&length); /* skip :targetlist */ \ + plan_node->targetlist = nodeRead(NULL, 0); 
\ + token = pg_strtok(&length); /* skip :qual */ \ + plan_node->qual = nodeRead(NULL, 0); \ + token = pg_strtok(&length); /* skip :lefttree */ \ + plan_node->lefttree = nodeRead(NULL, 0); \ + token = pg_strtok(&length); /* skip :righttree */ \ + plan_node->righttree = nodeRead(NULL, 0); \ + token = pg_strtok(&length); /* skip :initPlan */ \ + plan_node->initPlan = nodeRead(NULL, 0); \ + token = pg_strtok(&length); /* skip :extParam */ \ + plan_node->extParam = _readBitmapset(); \ + token = pg_strtok(&length); /* skip :allParam */ \ + plan_node->allParam = _readBitmapset() + +/* Read fields of a Scan node */ +#define READ_SCAN_FIELDS(nodeTypeName) \ + Scan *scan_node; \ + READ_PLAN_FIELDS(nodeTypeName); \ + scan_node = (Scan *) local_node; \ + token = pg_strtok(&length); /* skip :scanrelid */ \ + token = pg_strtok(&length); /* get field value */ \ + scan_node->scanrelid = atoi(token) + +/* Read fields of a Join node */ +#define READ_JOIN_FIELDS(nodeTypeName) \ + Join *join_node; \ + READ_PLAN_FIELDS(nodeTypeName); \ + join_node = (Join *) local_node; \ + token = pg_strtok(&length); /* skip :jointype */ \ + token = pg_strtok(&length); /* get field value */ \ + join_node->jointype = (JoinType) atoi(token); \ + token = pg_strtok(&length); /* skip :joinqual */ \ + join_node->joinqual = nodeRead(NULL, 0) + +/* + * Macros to read an identifier and lookup the OID + * The identifier depends on object type. 
+ */ +#define NSP_OID(nspname) LookupNamespaceNoError(nspname) + +/* Read relation identifier and lookup the OID */ +#define READ_RELID_FIELD(fldname) \ + do { \ + char *nspname; /* namespace name */ \ + char *relname; /* relation name */ \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get nspname */ \ + nspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get relname */ \ + relname = nullable_string(token, length); \ + if (relname) \ + local_node->fldname = get_relname_relid(relname, \ + NSP_OID(nspname)); \ + else \ + local_node->fldname = InvalidOid; \ + } while (0) + +/* Read data type identifier and lookup the OID */ +#define READ_TYPID_FIELD(fldname) \ + do { \ + char *nspname; /* namespace name */ \ + char *typname; /* data type name */ \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get nspname */ \ + nspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get typname */ \ + typname = nullable_string(token, length); \ + if (typname) \ + local_node->fldname = get_typname_typid(typname, \ + NSP_OID(nspname)); \ + else \ + local_node->fldname = InvalidOid; \ + } while (0) + +/* Read function identifier and lookup the OID */ +#define READ_FUNCID_FIELD(fldname) \ + do { \ + char *nspname; /* namespace name */ \ + char *funcname; /* function name */ \ + int nargs; /* number of arguments */ \ + Oid *argtypes; /* argument types */ \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get nspname */ \ + nspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get funcname */ \ + funcname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get nargs */ \ + nargs = atoi(token); \ + if (funcname) \ + { \ + int i; \ + argtypes = palloc(nargs * sizeof(Oid)); \ + for (i = 0; i < nargs; i++) \ + { \ + char *typnspname; /* argument type namespace */ \ + char *typname; /* argument type 
name */ \ + token = pg_strtok(&length); /* get type nspname */ \ + typnspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get type name */ \ + typname = nullable_string(token, length); \ + argtypes[i] = get_typname_typid(typname, \ + NSP_OID(typnspname)); \ + } \ + local_node->fldname = get_funcid(funcname, \ + buildoidvector(argtypes, nargs), \ + NSP_OID(nspname)); \ + } \ + else \ + local_node->fldname = InvalidOid; \ + } while (0) + +/* Read operator identifier and lookup the OID */ +#define READ_OPERID_FIELD(fldname) \ + do { \ + char *nspname; /* namespace name */ \ + char *oprname; /* operator name */ \ + char *leftnspname; /* left type namespace */ \ + char *leftname; /* left type name */ \ + Oid oprleft; /* left type */ \ + char *rightnspname; /* right type namespace */ \ + char *rightname; /* right type name */ \ + Oid oprright; /* right type */ \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get nspname */ \ + nspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get operator name */ \ + oprname = nullable_string(token, length); \ + token = pg_strtok(&length); /* left type namespace */ \ + leftnspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* left type name */ \ + leftname = nullable_string(token, length); \ + token = pg_strtok(&length); /* right type namespace */ \ + rightnspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* right type name */ \ + rightname = nullable_string(token, length); \ + if (oprname) \ + { \ + if (leftname) \ + oprleft = get_typname_typid(leftname, \ + NSP_OID(leftnspname)); \ + else \ + oprleft = InvalidOid; \ + if (rightname) \ + oprright = get_typname_typid(rightname, \ + NSP_OID(rightnspname)); \ + else \ + oprright = InvalidOid; \ + local_node->fldname = get_operid(oprname, \ + oprleft, \ + oprright, \ + NSP_OID(nspname)); \ + } \ + else \ + local_node->fldname = InvalidOid; \ + } while (0) 
+ +/* Read collation identifier and lookup the OID */ +#define READ_COLLID_FIELD(fldname) \ + do { \ + char *nspname; /* namespace name */ \ + char *collname; /* collation name */ \ + int collencoding; /* collation encoding */ \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get nspname */ \ + nspname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get collname */ \ + collname = nullable_string(token, length); \ + token = pg_strtok(&length); /* get collencoding */ \ + collencoding = atoi(token); \ + if (collname) \ + local_node->fldname = get_collid(collname, \ + collencoding, \ + NSP_OID(nspname)); \ + else \ + local_node->fldname = InvalidOid; \ + } while (0) +#endif + /* Routine exit */ #define READ_DONE() \ return local_node @@ -145,6 +389,9 @@ static Datum readDatum(bool typbyval); +#ifdef XCP +static Datum scanDatum(Oid typid, int typmod); +#endif /* * _readBitmapset @@ -266,7 +513,17 @@ _readSortGroupClause(void) READ_LOCALS(SortGroupClause); READ_UINT_FIELD(tleSortGroupRef); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(eqop); + else +#endif READ_OID_FIELD(eqop); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(sortop); + else +#endif READ_OID_FIELD(sortop); READ_BOOL_FIELD(nulls_first); READ_BOOL_FIELD(hashable); @@ -412,8 +669,18 @@ _readVar(void) READ_UINT_FIELD(varno); READ_INT_FIELD(varattno); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(vartype); + else +#endif READ_OID_FIELD(vartype); READ_INT_FIELD(vartypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(varcollid); + else +#endif READ_OID_FIELD(varcollid); READ_UINT_FIELD(varlevelsup); READ_UINT_FIELD(varnoold); @@ -431,8 +698,18 @@ _readConst(void) { READ_LOCALS(Const); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(consttype); + else +#endif READ_OID_FIELD(consttype); READ_INT_FIELD(consttypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(constcollid); + else +#endif 
READ_OID_FIELD(constcollid); READ_INT_FIELD(constlen); READ_BOOL_FIELD(constbyval); @@ -443,6 +720,12 @@ _readConst(void) if (local_node->constisnull) token = pg_strtok(&length); /* skip "<>" */ else +#ifdef XCP + if (portable_input) + local_node->constvalue = scanDatum(local_node->consttype, + local_node->consttypmod); + else +#endif local_node->constvalue = readDatum(local_node->constbyval); READ_DONE(); @@ -458,8 +741,18 @@ _readParam(void) READ_ENUM_FIELD(paramkind, ParamKind); READ_INT_FIELD(paramid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(paramtype); + else +#endif READ_OID_FIELD(paramtype); READ_INT_FIELD(paramtypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(paramcollid); + else +#endif READ_OID_FIELD(paramcollid); READ_LOCATION_FIELD(location); @@ -474,13 +767,35 @@ _readAggref(void) { READ_LOCALS(Aggref); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(aggfnoid); + else +#endif READ_OID_FIELD(aggfnoid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(aggtype); + else +#endif READ_OID_FIELD(aggtype); #ifdef PGXC +#ifndef XCP READ_OID_FIELD(aggtrantype); READ_BOOL_FIELD(agghas_collectfn); +#endif /* XCP */ #endif /* PGXC */ +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(aggcollid); + else +#endif READ_OID_FIELD(aggcollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_NODE_FIELD(aggorder); @@ -500,9 +815,29 @@ _readWindowFunc(void) { READ_LOCALS(WindowFunc); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(winfnoid); + else +#endif READ_OID_FIELD(winfnoid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(wintype); + else +#endif READ_OID_FIELD(wintype); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(wincollid); + else +#endif READ_OID_FIELD(wincollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); 
READ_UINT_FIELD(winref); @@ -521,9 +856,24 @@ _readArrayRef(void) { READ_LOCALS(ArrayRef); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(refarraytype); + else +#endif READ_OID_FIELD(refarraytype); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(refelemtype); + else +#endif READ_OID_FIELD(refelemtype); READ_INT_FIELD(reftypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(refcollid); + else +#endif READ_OID_FIELD(refcollid); READ_NODE_FIELD(refupperindexpr); READ_NODE_FIELD(reflowerindexpr); @@ -541,11 +891,31 @@ _readFuncExpr(void) { READ_LOCALS(FuncExpr); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(funcid); + else +#endif READ_OID_FIELD(funcid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(funcresulttype); + else +#endif READ_OID_FIELD(funcresulttype); READ_BOOL_FIELD(funcretset); READ_ENUM_FIELD(funcformat, CoercionForm); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(funccollid); + else +#endif READ_OID_FIELD(funccollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -577,9 +947,20 @@ _readOpExpr(void) { READ_LOCALS(OpExpr); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(opno); + else +#endif READ_OID_FIELD(opno); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(opfuncid); + else +#endif READ_OID_FIELD(opfuncid); +#ifndef XCP /* * The opfuncid is stored in the textual format primarily for debugging * and documentation reasons. We want to always read it as zero to force @@ -589,10 +970,26 @@ _readOpExpr(void) * someday.) 
*/ local_node->opfuncid = InvalidOid; +#endif +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(opresulttype); + else +#endif READ_OID_FIELD(opresulttype); READ_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(opcollid); + else +#endif READ_OID_FIELD(opcollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -608,9 +1005,20 @@ _readDistinctExpr(void) { READ_LOCALS(DistinctExpr); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(opno); + else +#endif READ_OID_FIELD(opno); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(opfuncid); + else +#endif READ_OID_FIELD(opfuncid); +#ifndef XCP /* * The opfuncid is stored in the textual format primarily for debugging * and documentation reasons. We want to always read it as zero to force @@ -620,10 +1028,26 @@ _readDistinctExpr(void) * someday.) */ local_node->opfuncid = InvalidOid; +#endif +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(opresulttype); + else +#endif READ_OID_FIELD(opresulttype); READ_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(opcollid); + else +#endif READ_OID_FIELD(opcollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -639,7 +1063,17 @@ _readNullIfExpr(void) { READ_LOCALS(NullIfExpr); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(opno); + else +#endif READ_OID_FIELD(opno); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(opfuncid); + else +#endif READ_OID_FIELD(opfuncid); /* @@ -650,11 +1084,30 @@ _readNullIfExpr(void) * (We don't currently support an ALTER OPERATOR command, but might * someday.) 
*/ +#ifdef XCP + /* Do not invalidate if we have just looked up the value */ + if (!portable_input) +#endif local_node->opfuncid = InvalidOid; +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(opresulttype); + else +#endif READ_OID_FIELD(opresulttype); READ_BOOL_FIELD(opretset); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(opcollid); + else +#endif READ_OID_FIELD(opcollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -670,9 +1123,19 @@ _readScalarArrayOpExpr(void) { READ_LOCALS(ScalarArrayOpExpr); +#ifdef XCP + if (portable_input) + READ_OPERID_FIELD(opno); + else +#endif READ_OID_FIELD(opno); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(opfuncid); + else +#endif READ_OID_FIELD(opfuncid); - +#ifndef XCP /* * The opfuncid is stored in the textual format primarily for debugging * and documentation reasons. We want to always read it as zero to force @@ -682,8 +1145,14 @@ _readScalarArrayOpExpr(void) * someday.) */ local_node->opfuncid = InvalidOid; +#endif READ_BOOL_FIELD(useOr); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -734,9 +1203,40 @@ _readSubLink(void) READ_DONE(); } +#ifdef XCP /* * _readSubPlan is not needed since it doesn't appear in stored rules. 
*/ +static SubPlan * +_readSubPlan(void) +{ + READ_LOCALS(SubPlan); + + READ_ENUM_FIELD(subLinkType, SubLinkType); + READ_NODE_FIELD(testexpr); + READ_NODE_FIELD(paramIds); + READ_INT_FIELD(plan_id); + READ_STRING_FIELD(plan_name); + if (portable_input) + READ_TYPID_FIELD(firstColType); + else + READ_OID_FIELD(firstColType); + READ_INT_FIELD(firstColTypmod); + if (portable_input) + READ_COLLID_FIELD(firstColCollation); + else + READ_OID_FIELD(firstColCollation); + READ_BOOL_FIELD(useHashTable); + READ_BOOL_FIELD(unknownEqFalse); + READ_NODE_FIELD(setParam); + READ_NODE_FIELD(parParam); + READ_NODE_FIELD(args); + READ_FLOAT_FIELD(startup_cost); + READ_FLOAT_FIELD(per_call_cost); + + READ_DONE(); +} +#endif /* * _readFieldSelect @@ -748,8 +1248,18 @@ _readFieldSelect(void) READ_NODE_FIELD(arg); READ_INT_FIELD(fieldnum); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(resultcollid); + else +#endif READ_OID_FIELD(resultcollid); READ_DONE(); @@ -766,6 +1276,11 @@ _readFieldStore(void) READ_NODE_FIELD(arg); READ_NODE_FIELD(newvals); READ_NODE_FIELD(fieldnums); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_DONE(); @@ -780,8 +1295,18 @@ _readRelabelType(void) READ_LOCALS(RelabelType); READ_NODE_FIELD(arg); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(resultcollid); + else +#endif READ_OID_FIELD(resultcollid); READ_ENUM_FIELD(relabelformat, CoercionForm); READ_LOCATION_FIELD(location); @@ -798,7 +1323,17 @@ _readCoerceViaIO(void) READ_LOCALS(CoerceViaIO); READ_NODE_FIELD(arg); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); +#ifdef XCP + if (portable_input) + 
READ_COLLID_FIELD(resultcollid); + else +#endif READ_OID_FIELD(resultcollid); READ_ENUM_FIELD(coerceformat, CoercionForm); READ_LOCATION_FIELD(location); @@ -815,9 +1350,24 @@ _readArrayCoerceExpr(void) READ_LOCALS(ArrayCoerceExpr); READ_NODE_FIELD(arg); +#ifdef XCP + if (portable_input) + READ_FUNCID_FIELD(elemfuncid); + else +#endif READ_OID_FIELD(elemfuncid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(resultcollid); + else +#endif READ_OID_FIELD(resultcollid); READ_BOOL_FIELD(isExplicit); READ_ENUM_FIELD(coerceformat, CoercionForm); @@ -835,6 +1385,11 @@ _readConvertRowtypeExpr(void) READ_LOCALS(ConvertRowtypeExpr); READ_NODE_FIELD(arg); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_ENUM_FIELD(convertformat, CoercionForm); READ_LOCATION_FIELD(location); @@ -865,7 +1420,17 @@ _readCaseExpr(void) { READ_LOCALS(CaseExpr); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(casetype); + else +#endif READ_OID_FIELD(casetype); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(casecollid); + else +#endif READ_OID_FIELD(casecollid); READ_NODE_FIELD(arg); READ_NODE_FIELD(args); @@ -898,8 +1463,18 @@ _readCaseTestExpr(void) { READ_LOCALS(CaseTestExpr); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(typeId); + else +#endif READ_OID_FIELD(typeId); READ_INT_FIELD(typeMod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(collation); + else +#endif READ_OID_FIELD(collation); READ_DONE(); @@ -913,8 +1488,23 @@ _readArrayExpr(void) { READ_LOCALS(ArrayExpr); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(array_typeid); + else +#endif READ_OID_FIELD(array_typeid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(array_collid); + else +#endif READ_OID_FIELD(array_collid); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(element_typeid); 
+ else +#endif READ_OID_FIELD(element_typeid); READ_NODE_FIELD(elements); READ_BOOL_FIELD(multidims); @@ -932,6 +1522,11 @@ _readRowExpr(void) READ_LOCALS(RowExpr); READ_NODE_FIELD(args); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(row_typeid); + else +#endif READ_OID_FIELD(row_typeid); READ_ENUM_FIELD(row_format, CoercionForm); READ_NODE_FIELD(colnames); @@ -966,7 +1561,17 @@ _readCoalesceExpr(void) { READ_LOCALS(CoalesceExpr); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(coalescetype); + else +#endif READ_OID_FIELD(coalescetype); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(coalescecollid); + else +#endif READ_OID_FIELD(coalescecollid); READ_NODE_FIELD(args); READ_LOCATION_FIELD(location); @@ -982,8 +1587,23 @@ _readMinMaxExpr(void) { READ_LOCALS(MinMaxExpr); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(minmaxtype); + else +#endif READ_OID_FIELD(minmaxtype); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(minmaxcollid); + else +#endif READ_OID_FIELD(minmaxcollid); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(inputcollid); + else +#endif READ_OID_FIELD(inputcollid); READ_ENUM_FIELD(op, MinMaxOp); READ_NODE_FIELD(args); @@ -1006,6 +1626,11 @@ _readXmlExpr(void) READ_NODE_FIELD(arg_names); READ_NODE_FIELD(args); READ_ENUM_FIELD(xmloption, XmlOptionType); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(type); + else +#endif READ_OID_FIELD(type); READ_INT_FIELD(typmod); READ_LOCATION_FIELD(location); @@ -1051,8 +1676,18 @@ _readCoerceToDomain(void) READ_LOCALS(CoerceToDomain); READ_NODE_FIELD(arg); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(resulttype); + else +#endif READ_OID_FIELD(resulttype); READ_INT_FIELD(resulttypmod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(resultcollid); + else +#endif READ_OID_FIELD(resultcollid); READ_ENUM_FIELD(coercionformat, CoercionForm); READ_LOCATION_FIELD(location); @@ -1068,8 +1703,18 @@ _readCoerceToDomainValue(void) { READ_LOCALS(CoerceToDomainValue); 
+#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(typeId); + else +#endif READ_OID_FIELD(typeId); READ_INT_FIELD(typeMod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(collation); + else +#endif READ_OID_FIELD(collation); READ_LOCATION_FIELD(location); @@ -1084,8 +1729,18 @@ _readSetToDefault(void) { READ_LOCALS(SetToDefault); +#ifdef XCP + if (portable_input) + READ_TYPID_FIELD(typeId); + else +#endif READ_OID_FIELD(typeId); READ_INT_FIELD(typeMod); +#ifdef XCP + if (portable_input) + READ_COLLID_FIELD(collation); + else +#endif READ_OID_FIELD(collation); READ_LOCATION_FIELD(location); @@ -1119,6 +1774,11 @@ _readTargetEntry(void) READ_INT_FIELD(resno); READ_STRING_FIELD(resname); READ_UINT_FIELD(ressortgroupref); +#ifdef XCP + if (portable_input) + READ_RELID_FIELD(resorigtbl); + else +#endif READ_OID_FIELD(resorigtbl); READ_INT_FIELD(resorigcol); READ_BOOL_FIELD(resjunk); @@ -1191,12 +1851,19 @@ _readRangeTblEntry(void) READ_NODE_FIELD(eref); READ_ENUM_FIELD(rtekind, RTEKind); #ifdef PGXC +#ifndef XCP READ_STRING_FIELD(relname); #endif +#endif switch (local_node->rtekind) { case RTE_RELATION: +#ifdef XCP + if (portable_input) + READ_RELID_FIELD(relid); + else +#endif READ_OID_FIELD(relid); READ_CHAR_FIELD(relkind); break; @@ -1240,6 +1907,16 @@ _readRangeTblEntry(void) READ_BOOL_FIELD(inh); READ_BOOL_FIELD(inFromCl); READ_UINT_FIELD(requiredPerms); +#ifdef XCP + if (portable_input) + { + local_node->requiredPerms = 0; /* no permission checks on data node */ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* skip field value */ \ + local_node->checkAsUser = InvalidOid; + } + else +#endif READ_OID_FIELD(checkAsUser); READ_BITMAPSET_FIELD(selectedCols); READ_BITMAPSET_FIELD(modifiedCols); @@ -1248,6 +1925,1310 @@ _readRangeTblEntry(void) } +#ifdef XCP +/* + * _readPlan + */ +static Plan * +_readPlan(void) +{ + READ_PLAN_FIELDS(Plan); + + READ_DONE(); +} + + + +/* + * _readResult + */ +static Result * +_readResult(void) 
+{ + READ_PLAN_FIELDS(Result); + + READ_NODE_FIELD(resconstantqual); + + READ_DONE(); +} + + +/* + * _readModifyTable + */ +static ModifyTable * +_readModifyTable(void) +{ + READ_PLAN_FIELDS(ModifyTable); + + READ_ENUM_FIELD(operation, CmdType); + READ_BOOL_FIELD(canSetTag); + READ_NODE_FIELD(resultRelations); + READ_INT_FIELD(resultRelIndex); + READ_NODE_FIELD(plans); + READ_NODE_FIELD(returningLists); + READ_NODE_FIELD(rowMarks); + READ_INT_FIELD(epqParam); + + READ_DONE(); +} + + +/* + * _readAppend + */ +static Append * +_readAppend(void) +{ + READ_PLAN_FIELDS(Append); + + READ_NODE_FIELD(appendplans); + + READ_DONE(); +} + + +/* + * _readMergeAppend + */ +static MergeAppend * +_readMergeAppend(void) +{ + int i; + READ_PLAN_FIELDS(MergeAppend); + + READ_NODE_FIELD(mergeplans); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = 
nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :collations */ + local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->collations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->collations[i] = InvalidOid; + } + else + local_node->collations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readRecursiveUnion + */ +static RecursiveUnion * +_readRecursiveUnion(void) +{ + int i; + READ_PLAN_FIELDS(RecursiveUnion); + + READ_INT_FIELD(wtParam); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :dupColIdx */ + 
local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :dupOperators */ + local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupOperators[i] = atooid(token); + } + + READ_LONG_FIELD(numGroups); + + READ_DONE(); +} + + +/* + * _readBitmapAnd + */ +static BitmapAnd * +_readBitmapAnd(void) +{ + READ_PLAN_FIELDS(BitmapAnd); + + READ_NODE_FIELD(bitmapplans); + + READ_DONE(); +} + + +/* + * _readBitmapOr + */ +static BitmapOr * +_readBitmapOr(void) +{ + READ_PLAN_FIELDS(BitmapOr); + + READ_NODE_FIELD(bitmapplans); + + READ_DONE(); +} + + +/* + * _readScan + */ +static Scan * +_readScan(void) +{ + READ_SCAN_FIELDS(Scan); + + READ_DONE(); +} + + +/* + * _readSeqScan + */ +static SeqScan * +_readSeqScan(void) +{ + READ_SCAN_FIELDS(SeqScan); + + READ_DONE(); +} + + +/* + * _readIndexScan + */ +static IndexScan * +_readIndexScan(void) +{ + READ_SCAN_FIELDS(IndexScan); + + if (portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexqualorig); + READ_NODE_FIELD(indexorderby); + READ_NODE_FIELD(indexorderbyorig); + READ_ENUM_FIELD(indexorderdir, ScanDirection); + + READ_DONE(); +} + + +/* + * _readIndexOnlyScan + */ +static IndexOnlyScan * +_readIndexOnlyScan(void) +{ + READ_SCAN_FIELDS(IndexOnlyScan); + + if (portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexorderby); + READ_NODE_FIELD(indextlist); + READ_ENUM_FIELD(indexorderdir, ScanDirection); + + READ_DONE(); +} + + +/* + * _readBitmapIndexScan + */ +static BitmapIndexScan * +_readBitmapIndexScan(void) +{ + READ_SCAN_FIELDS(BitmapIndexScan); + + if 
(portable_input) + READ_RELID_FIELD(indexid); + else + READ_OID_FIELD(indexid); + READ_NODE_FIELD(indexqual); + READ_NODE_FIELD(indexqualorig); + + READ_DONE(); +} + + +/* + * _readBitmapHeapScan + */ +static BitmapHeapScan * +_readBitmapHeapScan(void) +{ + READ_SCAN_FIELDS(BitmapHeapScan); + + READ_NODE_FIELD(bitmapqualorig); + + READ_DONE(); +} + + +/* + * _readTidScan + */ +static TidScan * +_readTidScan(void) +{ + READ_SCAN_FIELDS(TidScan); + + READ_NODE_FIELD(tidquals); + + READ_DONE(); +} + + +/* + * _readSubqueryScan + */ +static SubqueryScan * +_readSubqueryScan(void) +{ + READ_SCAN_FIELDS(SubqueryScan); + + READ_NODE_FIELD(subplan); + + READ_DONE(); +} + + +/* + * _readFunctionScan + */ +static FunctionScan * +_readFunctionScan(void) +{ + READ_SCAN_FIELDS(FunctionScan); + + READ_NODE_FIELD(funcexpr); + READ_NODE_FIELD(funccolnames); + READ_NODE_FIELD(funccoltypes); + READ_NODE_FIELD(funccoltypmods); + READ_NODE_FIELD(funccolcollations); + + READ_DONE(); +} + + +/* + * _readValuesScan + */ +static ValuesScan * +_readValuesScan(void) +{ + READ_SCAN_FIELDS(ValuesScan); + + READ_NODE_FIELD(values_lists); + + READ_DONE(); +} + + +/* + * _readCteScan + */ +static CteScan * +_readCteScan(void) +{ + READ_SCAN_FIELDS(CteScan); + + READ_INT_FIELD(ctePlanId); + READ_INT_FIELD(cteParam); + + READ_DONE(); +} + + +/* + * _readWorkTableScan + */ +static WorkTableScan * +_readWorkTableScan(void) +{ + READ_SCAN_FIELDS(WorkTableScan); + + READ_INT_FIELD(wtParam); + + READ_DONE(); +} + + +/* + * _readJoin + */ +static Join * +_readJoin(void) +{ + READ_JOIN_FIELDS(Join); + + READ_DONE(); +} + + +/* + * _readNestLoop + */ +static NestLoop * +_readNestLoop(void) +{ + READ_JOIN_FIELDS(NestLoop); + + READ_NODE_FIELD(nestParams); + + READ_DONE(); +} + + +/* + * _readMergeJoin + */ +static MergeJoin * +_readMergeJoin(void) +{ + int numCols; + int i; + READ_JOIN_FIELDS(MergeJoin); + + READ_NODE_FIELD(mergeclauses); + numCols = list_length(local_node->mergeclauses); + + + token = 
pg_strtok(&length); /* skip :mergeFamilies */ + local_node->mergeFamilies = (Oid *) palloc(numCols * sizeof(Oid)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeFamilies[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :mergeCollations */ + local_node->mergeCollations = (Oid *) palloc(numCols * sizeof(Oid)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->mergeCollations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->mergeCollations[i] = InvalidOid; + } + else + local_node->mergeCollations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :mergeStrategies */ + local_node->mergeStrategies = (int *) palloc(numCols * sizeof(int)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeStrategies[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :mergeNullsFirst */ + local_node->mergeNullsFirst = (bool *) palloc(numCols * sizeof(bool)); + for (i = 0; i < numCols; i++) + { + token = pg_strtok(&length); + local_node->mergeNullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readHashJoin + */ +static HashJoin * +_readHashJoin(void) +{ + READ_JOIN_FIELDS(HashJoin); + + READ_NODE_FIELD(hashclauses); + + READ_DONE(); +} + + +/* + * _readMaterial + */ +static Material * +_readMaterial(void) +{ + READ_PLAN_FIELDS(Material); + + READ_DONE(); +} + + +/* + * _readSort + */ +static Sort * +_readSort(void) +{ + int i; + READ_PLAN_FIELDS(Sort); + + READ_INT_FIELD(numCols); + + token = 
pg_strtok(&length); /* skip :sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :collations */ + local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = 
pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->collations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->collations[i] = InvalidOid; + } + else + local_node->collations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readGroup + */ +static Group * +_readGroup(void) +{ + int i; + READ_PLAN_FIELDS(Group); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :grpColIdx */ + local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->grpColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :grpOperators */ + local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator 
name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->grpOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->grpOperators[i] = atooid(token); + } + + READ_DONE(); +} + + +/* + * _readAgg + */ +static Agg * +_readAgg(void) +{ + int i; + READ_PLAN_FIELDS(Agg); + + READ_ENUM_FIELD(aggstrategy, AggStrategy); + READ_ENUM_FIELD(aggdistribution, AggDistribution); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :grpColIdx */ + local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->grpColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :grpOperators */ + local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token 
= pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->grpOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->grpOperators[i] = atooid(token); + } + + READ_LONG_FIELD(numGroups); + + READ_DONE(); +} + + +/* + * _readWindowAgg + */ +static WindowAgg * +_readWindowAgg(void) +{ + int i; + READ_PLAN_FIELDS(WindowAgg); + + READ_INT_FIELD(winref); + READ_INT_FIELD(partNumCols); + + token = pg_strtok(&length); /* skip :partColIdx */ + local_node->partColIdx = (AttrNumber *) palloc(local_node->partNumCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->partNumCols; i++) + { + token = pg_strtok(&length); + local_node->partColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :partOperators */ + local_node->partOperators = (Oid *) palloc(local_node->partNumCols * sizeof(Oid)); + for (i = 0; i < local_node->partNumCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + 
nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->partOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->partOperators[i] = atooid(token); + } + + READ_INT_FIELD(ordNumCols); + + token = pg_strtok(&length); /* skip :ordColIdx */ + local_node->ordColIdx = (AttrNumber *) palloc(local_node->ordNumCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->ordNumCols; i++) + { + token = pg_strtok(&length); + local_node->ordColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :ordOperators */ + local_node->ordOperators = (Oid *) palloc(local_node->ordNumCols * sizeof(Oid)); + for (i = 0; i < local_node->ordNumCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = 
pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->ordOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->ordOperators[i] = atooid(token); + } + + READ_INT_FIELD(frameOptions); + READ_NODE_FIELD(startOffset); + READ_NODE_FIELD(endOffset); + + READ_DONE(); +} + + +/* + * _readUnique + */ +static Unique * +_readUnique(void) +{ + int i; + READ_PLAN_FIELDS(Unique); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :uniqColIdx */ + local_node->uniqColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->uniqColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :uniqOperators */ + local_node->uniqOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ 
+ oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->uniqOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->uniqOperators[i] = atooid(token); + } + + READ_DONE(); +} + + +/* + * _readHash + */ +static Hash * +_readHash(void) +{ + READ_PLAN_FIELDS(Hash); + + if (portable_input) + READ_RELID_FIELD(skewTable); + else + READ_OID_FIELD(skewTable); + READ_INT_FIELD(skewColumn); + READ_BOOL_FIELD(skewInherit); + if (portable_input) + READ_TYPID_FIELD(skewColType); + else + READ_OID_FIELD(skewColType); + READ_INT_FIELD(skewColTypmod); + + READ_DONE(); +} + + +/* + * _readSetOp + */ +static SetOp * +_readSetOp(void) +{ + int i; + READ_PLAN_FIELDS(SetOp); + + READ_ENUM_FIELD(cmd, SetOpCmd); + READ_ENUM_FIELD(strategy, SetOpStrategy); + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :dupColIdx */ + local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :dupOperators */ + local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->dupOperators[i] = atooid(token); + } + + 
READ_INT_FIELD(flagColIdx); + READ_INT_FIELD(firstFlag); + READ_LONG_FIELD(numGroups); + + READ_DONE(); +} + + +/* + * _readLimit + */ +static Limit * +_readLimit(void) +{ + READ_PLAN_FIELDS(Limit); + + READ_NODE_FIELD(limitOffset); + READ_NODE_FIELD(limitCount); + + READ_DONE(); +} + + +/* + * _readRemoteSubplan + */ +static RemoteSubplan * +_readRemoteSubplan(void) +{ + READ_SCAN_FIELDS(RemoteSubplan); + + READ_CHAR_FIELD(distributionType); + READ_INT_FIELD(distributionKey); + READ_NODE_FIELD(distributionNodes); + READ_NODE_FIELD(distributionRestrict); + READ_NODE_FIELD(nodeList); + READ_BOOL_FIELD(execOnAll); + READ_NODE_FIELD(sort); + READ_STRING_FIELD(cursor); + READ_INT_FIELD(unique); + + READ_DONE(); +} + + +/* + * _readRemoteStmt + */ +static RemoteStmt * +_readRemoteStmt(void) +{ + int i; + READ_LOCALS(RemoteStmt); + + READ_ENUM_FIELD(commandType, CmdType); + READ_BOOL_FIELD(hasReturning); + READ_NODE_FIELD(planTree); + READ_NODE_FIELD(rtable); + READ_NODE_FIELD(resultRelations); + READ_NODE_FIELD(subplans); + READ_INT_FIELD(nParamExec); + READ_INT_FIELD(nParamRemote); + if (local_node->nParamRemote > 0) + { + local_node->remoteparams = (RemoteParam *) palloc( + local_node->nParamRemote * sizeof(RemoteParam)); + for (i = 0; i < local_node->nParamRemote; i++) + { + RemoteParam *rparam = &(local_node->remoteparams[i]); + token = pg_strtok(&length); /* skip :paramkind */ + token = pg_strtok(&length); + rparam->paramkind = (ParamKind) atoi(token); + + token = pg_strtok(&length); /* skip :paramid */ + token = pg_strtok(&length); + rparam->paramid = atoi(token); + + token = pg_strtok(&length); /* skip :paramtype */ + if (portable_input) + { + char *nspname; /* namespace name */ + char *typname; /* data type name */ + token = pg_strtok(&length); /* get nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get typname */ + typname = nullable_string(token, length); + if (typname) + rparam->paramtype = get_typname_typid(typname, + 
NSP_OID(nspname)); + else + rparam->paramtype = InvalidOid; + } + else + { + token = pg_strtok(&length); + rparam->paramtype = atooid(token); + } + } + } + else + local_node->remoteparams = NULL; + + READ_NODE_FIELD(rowMarks); + READ_CHAR_FIELD(distributionType); + READ_INT_FIELD(distributionKey); + READ_NODE_FIELD(distributionNodes); + READ_NODE_FIELD(distributionRestrict); + + READ_DONE(); +} + + +/* + * _readSimpleSort + */ +static SimpleSort * +_readSimpleSort(void) +{ + int i; + READ_LOCALS(SimpleSort); + + READ_INT_FIELD(numCols); + + token = pg_strtok(&length); /* skip :sortColIdx */ + local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->sortColIdx[i] = atoi(token); + } + + token = pg_strtok(&length); /* skip :sortOperators */ + local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *oprname; /* operator name */ + char *leftnspname; /* left type namespace */ + char *leftname; /* left type name */ + Oid oprleft; /* left type */ + char *rightnspname; /* right type namespace */ + char *rightname; /* right type name */ + Oid oprright; /* right type */ + /* token is already set to nspname */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get operator name */ + oprname = nullable_string(token, length); + token = pg_strtok(&length); /* left type namespace */ + leftnspname = nullable_string(token, length); + token = pg_strtok(&length); /* left type name */ + leftname = nullable_string(token, length); + token = pg_strtok(&length); /* right type namespace */ + rightnspname = nullable_string(token, length); + token = pg_strtok(&length); /* right type name */ + rightname = nullable_string(token, length); + if (leftname) + oprleft = 
get_typname_typid(leftname, + NSP_OID(leftnspname)); + else + oprleft = InvalidOid; + if (rightname) + oprright = get_typname_typid(rightname, + NSP_OID(rightnspname)); + else + oprright = InvalidOid; + local_node->sortOperators[i] = get_operid(oprname, + oprleft, + oprright, + NSP_OID(nspname)); + } + else + local_node->sortOperators[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :sortCollations */ + local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + if (portable_input) + { + char *nspname; /* namespace name */ + char *collname; /* collation name */ + int collencoding; /* collation encoding */ + /* the token is already read */ + nspname = nullable_string(token, length); + token = pg_strtok(&length); /* get collname */ + collname = nullable_string(token, length); + token = pg_strtok(&length); /* get nargs */ + collencoding = atoi(token); + if (collname) + local_node->sortCollations[i] = get_collid(collname, + collencoding, + NSP_OID(nspname)); + else + local_node->sortCollations[i] = InvalidOid; + } + else + local_node->sortCollations[i] = atooid(token); + } + + token = pg_strtok(&length); /* skip :nullsFirst */ + local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool)); + for (i = 0; i < local_node->numCols; i++) + { + token = pg_strtok(&length); + local_node->nullsFirst[i] = strtobool(token); + } + + READ_DONE(); +} + + +/* + * _readNestLoopParam + */ +static NestLoopParam * +_readNestLoopParam(void) +{ + READ_LOCALS(NestLoopParam); + + READ_INT_FIELD(paramno); + READ_NODE_FIELD(paramval); + + READ_DONE(); +} + + +/* + * _readPlanRowMark + */ +static PlanRowMark * +_readPlanRowMark(void) +{ + READ_LOCALS(PlanRowMark); + + READ_UINT_FIELD(rti); + READ_UINT_FIELD(prti); + READ_UINT_FIELD(rowmarkId); + READ_ENUM_FIELD(markType, RowMarkType); + READ_BOOL_FIELD(noWait); + READ_BOOL_FIELD(isParent); + + READ_DONE(); +} + +/* 
+ * _readLockRows + */ +static LockRows * +_readLockRows(void) +{ + READ_PLAN_FIELDS(LockRows); + + READ_NODE_FIELD(rowMarks); + READ_INT_FIELD(epqParam); + + READ_DONE(); +} + +#endif /* XCP */ + + /* * parseNodeString * @@ -1314,6 +3295,10 @@ parseNodeString(void) return_value = _readBoolExpr(); else if (MATCH("SUBLINK", 7)) return_value = _readSubLink(); +#ifdef XCP + else if (MATCH("SUBPLAN", 7)) + return_value = _readSubPlan(); +#endif else if (MATCH("FIELDSELECT", 11)) return_value = _readFieldSelect(); else if (MATCH("FIELDSTORE", 10)) @@ -1372,6 +3357,86 @@ parseNodeString(void) return_value = _readNotifyStmt(); else if (MATCH("DECLARECURSOR", 13)) return_value = _readDeclareCursorStmt(); +#ifdef XCP + else if (MATCH("PLAN", 4)) + return_value = _readPlan(); + else if (MATCH("RESULT", 6)) + return_value = _readResult(); + else if (MATCH("MODIFYTABLE", 11)) + return_value = _readModifyTable(); + else if (MATCH("APPEND", 6)) + return_value = _readAppend(); + else if (MATCH("MERGEAPPEND", 11)) + return_value = _readMergeAppend(); + else if (MATCH("RECURSIVEUNION", 14)) + return_value = _readRecursiveUnion(); + else if (MATCH("BITMAPAND", 9)) + return_value = _readBitmapAnd(); + else if (MATCH("BITMAPOR", 8)) + return_value = _readBitmapOr(); + else if (MATCH("SCAN", 4)) + return_value = _readScan(); + else if (MATCH("SEQSCAN", 7)) + return_value = _readSeqScan(); + else if (MATCH("INDEXSCAN", 9)) + return_value = _readIndexScan(); + else if (MATCH("INDEXONLYSCAN", 13)) + return_value = _readIndexOnlyScan(); + else if (MATCH("BITMAPINDEXSCAN", 15)) + return_value = _readBitmapIndexScan(); + else if (MATCH("BITMAPHEAPSCAN", 14)) + return_value = _readBitmapHeapScan(); + else if (MATCH("TIDSCAN", 7)) + return_value = _readTidScan(); + else if (MATCH("SUBQUERYSCAN", 12)) + return_value = _readSubqueryScan(); + else if (MATCH("FUNCTIONSCAN", 12)) + return_value = _readFunctionScan(); + else if (MATCH("VALUESSCAN", 10)) + return_value = _readValuesScan(); + else if 
(MATCH("CTESCAN", 7)) + return_value = _readCteScan(); + else if (MATCH("WORKTABLESCAN", 13)) + return_value = _readWorkTableScan(); + else if (MATCH("JOIN", 4)) + return_value = _readJoin(); + else if (MATCH("NESTLOOP", 8)) + return_value = _readNestLoop(); + else if (MATCH("MERGEJOIN", 9)) + return_value = _readMergeJoin(); + else if (MATCH("HASHJOIN", 8)) + return_value = _readHashJoin(); + else if (MATCH("MATERIAL", 8)) + return_value = _readMaterial(); + else if (MATCH("SORT", 4)) + return_value = _readSort(); + else if (MATCH("GROUP", 5)) + return_value = _readGroup(); + else if (MATCH("AGG", 3)) + return_value = _readAgg(); + else if (MATCH("WINDOWAGG", 9)) + return_value = _readWindowAgg(); + else if (MATCH("UNIQUE", 6)) + return_value = _readUnique(); + else if (MATCH("HASH", 4)) + return_value = _readHash(); + else if (MATCH("SETOP", 5)) + return_value = _readSetOp(); + else if (MATCH("LIMIT", 5)) + return_value = _readLimit(); + else if (MATCH("REMOTESUBPLAN", 13)) + return_value = _readRemoteSubplan(); + else if (MATCH("REMOTESTMT", 10)) + return_value = _readRemoteStmt(); + else if (MATCH("SIMPLESORT", 10)) + return_value = _readSimpleSort(); + else if (MATCH("NESTLOOPPARAM", 13)) + return_value = _readNestLoopParam(); + else if (MATCH("PLANROWMARK", 11)) + return_value = _readPlanRowMark(); + else if (MATCH("LOCKROWS", 8)) + return_value = _readLockRows(); +#endif else { elog(ERROR, "badly formatted node string \"%.32s\"...", token); @@ -1445,3 +3510,49 @@ readDatum(bool typbyval) return res; } + +#ifdef XCP +/* + * scanDatum + * + * Recreate Datum from the text format understandable by the input function + * of the specified data type. 
+ */ +static Datum +scanDatum(Oid typid, int typmod) +{ + Oid typInput; + Oid typioparam; + FmgrInfo finfo; + FunctionCallInfoData fcinfo; + char *value; + Datum res; + READ_TEMP_LOCALS(); + + /* Get input function for the type */ + getTypeInputInfo(typid, &typInput, &typioparam); + fmgr_info(typInput, &finfo); + + /* Read the value */ + token = pg_strtok(&length); + value = nullable_string(token, length); + + /* The value can not be NULL, so we actually received empty string */ + if (value == NULL) + value = ""; + + /* Invoke input function */ + InitFunctionCallInfoData(fcinfo, &finfo, 3, InvalidOid, NULL, NULL); + + fcinfo.arg[0] = CStringGetDatum(value); + fcinfo.arg[1] = ObjectIdGetDatum(typioparam); + fcinfo.arg[2] = Int32GetDatum(typmod); + fcinfo.argnull[0] = false; + fcinfo.argnull[1] = false; + fcinfo.argnull[2] = false; + + res = FunctionCallInvoke(&fcinfo); + + return res; +} +#endif diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile index 0d9ffe58a7..07938dbe57 100644 --- a/src/backend/optimizer/path/Makefile +++ b/src/backend/optimizer/path/Makefile @@ -13,7 +13,6 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = allpaths.o clausesel.o costsize.o equivclass.o indxpath.o \ - joinpath.o joinrels.o orindxpath.o pathkeys.o tidpath.o \ - pgxcpath.o + joinpath.o joinrels.o orindxpath.o pathkeys.o tidpath.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 1331de75fc..525a659007 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3,6 +3,11 @@ * allpaths.c * Routines to find possible search paths for processing a query * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -36,6 +41,16 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#ifdef PGXC +#ifdef XCP +#include "nodes/makefuncs.h" +#include "miscadmin.h" +#else +#include "catalog/pg_namespace.h" +#include "catalog/pg_class.h" +#include "pgxc/pgxc.h" +#endif /* XCP */ +#endif /* PGXC */ #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" @@ -378,9 +393,22 @@ static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { #ifdef PGXC - if (!create_plainrel_rqpath(root, rel, rte)) +#ifndef XCP + /* + * If we are on the Coordinator, we always want to use + * the remote query path unless it is a pg_catalog table + * or a sequence relation. + */ + if (IS_PGXC_COORDINATOR && + !IsConnFromCoord() && + get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE && + get_rel_relkind(rte->relid) != RELKIND_SEQUENCE && + !root->parse->is_local) + add_path(rel, create_remotequery_path(root, rel)); + else { -#endif +#endif /* XCP */ +#endif /* PGXC */ /* Consider sequential scan */ add_path(rel, create_seqscan_path(root, rel, NULL)); @@ -391,8 +419,10 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* Consider TID scans */ create_tidscan_paths(root, rel); #ifdef PGXC +#ifndef XCP } -#endif +#endif /* XCP */ +#endif /* PGXC */ /* Now find the cheapest of the paths for this rel */ set_cheapest(rel); @@ -1035,6 +1065,9 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, double tuple_fraction; PlannerInfo *subroot; List *pathkeys; +#ifdef XCP + Distribution *distribution; +#endif /* * Must copy the Query so that planning doesn't mess up the RTE contents @@ -1144,7 +1177,53 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, pathkeys = 
convert_subquery_pathkeys(root, rel, subroot->query_pathkeys); /* Generate appropriate path */ +#ifdef XCP + if (subroot->distribution && subroot->distribution->distributionExpr) + { + ListCell *lc; + /* + * The distribution expression from the subplan's tlist, but it should + * be from the rel, need conversion. + */ + distribution = makeNode(Distribution); + distribution->distributionType = subroot->distribution->distributionType; + distribution->nodes = bms_copy(subroot->distribution->nodes); + distribution->restrictNodes = bms_copy(subroot->distribution->restrictNodes); + foreach(lc, rel->subplan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + if (equal(tle->expr, subroot->distribution->distributionExpr)) + { + distribution->distributionExpr = (Node *) + makeVarFromTargetEntry(rel->relid, tle); + break; + } + } + } + else + distribution = subroot->distribution; + add_path(rel, create_subqueryscan_path(root, rel, pathkeys, NULL, + distribution)); + + /* + * Temporarily block ORDER BY in subqueries until we can add support + * it in Postgres-XL without outputting incorrect results. Should + * do this only in normal processing mode though! + * + * The extra conditions below try to handle cases where an ORDER BY + * appears in a simple VIEW or INSERT SELECT. + */ + if (IsUnderPostmaster && + list_length(subquery->sortClause) > 1 + && (subroot->parent_root != root + || (subroot->parent_root == root + && (root->parse->commandType != CMD_SELECT + || (root->parse->commandType == CMD_SELECT + && root->parse->hasWindowFuncs))))) + elog(ERROR, "Postgres-XL does not currently support ORDER BY in subqueries"); +#else add_path(rel, create_subqueryscan_path(root, rel, pathkeys, NULL)); +#endif /* Select cheapest path (pretty easy in this case...) 
*/ set_cheapest(rel); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0f5f72d9a3..ba71c15594 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -57,6 +57,11 @@ * values. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -97,6 +102,10 @@ double random_page_cost = DEFAULT_RANDOM_PAGE_COST; double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST; double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST; double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST; +#ifdef XCP +double network_byte_cost = DEFAULT_NETWORK_BYTE_COST; +double remote_query_cost = DEFAULT_REMOTE_QUERY_COST; +#endif int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE; @@ -114,11 +123,8 @@ bool enable_material = true; bool enable_mergejoin = true; bool enable_hashjoin = true; #ifdef PGXC -bool enable_fast_query_shipping = true; bool enable_remotejoin = true; bool enable_remotegroup = true; -bool enable_remotesort = true; -bool enable_remotelimit = true; #endif typedef struct @@ -2242,6 +2248,15 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, relation_byte_size(inner_path_rows, inner_path->parent->width) > (work_mem * 1024L)) path->materialize_inner = true; +#ifdef XCP + /* + * Even if innersortkeys are specified, we never add the Sort node on top + * of RemoteSubplan, instead we set up internal sorter. 
+ * Since RemoteSubplan does not support mark/restore we must materialize it + */ + else if (inner_path->pathtype == T_RemoteSubplan) + path->materialize_inner = true; +#endif else path->materialize_inner = false; @@ -2850,22 +2865,6 @@ cost_rescan(PlannerInfo *root, Path *path, } } -#ifdef PGXC -/* - * cost_remotequery - * As of now the function just sets the costs to 0 to make this path the - * cheapest. - * PGXC_TODO: Ideally, we should estimate the costs of network transfer from - * datanodes and any datanode costs involved. - */ -void -cost_remotequery(RemoteQueryPath *rqpath, PlannerInfo *root, RelOptInfo *rel) -{ - rqpath->path.startup_cost = 0; - rqpath->path.total_cost = 0; - rqpath->path.rows = rel->rows; -} -#endif /* PGXC */ /* * cost_qual_eval @@ -4032,3 +4031,30 @@ page_size(double tuples, int width) { return ceil(relation_byte_size(tuples, width) / BLCKSZ); } + + +#ifdef XCP +void +cost_remote_subplan(Path *path, + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width, int replication) +{ + Cost startup_cost = input_startup_cost + remote_query_cost; + Cost run_cost = input_total_cost - input_startup_cost; + + path->rows = tuples; + + /* + * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead. 
+ */ + run_cost += 2 * cpu_operator_cost * tuples; + + /* + * Estimate cost of sending data over network + */ + run_cost += network_byte_cost * tuples * width * replication; + + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} +#endif diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 0463ec92b4..65f86194e1 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -196,16 +196,6 @@ add_paths_to_joinrel(PlannerInfo *root, hash_inner_and_outer(root, joinrel, outerrel, innerrel, restrictlist, jointype, sjinfo, &semifactors, param_source_rels); - -#ifdef PGXC - /* - * If the inner and outer relations have RemoteQuery paths, check if this - * JOIN can be pushed to the data-nodes. If so, create a RemoteQuery path - * corresponding to the this JOIN. - */ - create_joinrel_rqpath(root, joinrel, outerrel, innerrel, restrictlist, - jointype, sjinfo); -#endif /* PGXC */ } /* @@ -1338,4 +1328,3 @@ select_mergejoin_clauses(PlannerInfo *root, return result_list; } - diff --git a/src/backend/optimizer/plan/Makefile b/src/backend/optimizer/plan/Makefile index 759a669ef5..88a9f7ff8c 100644 --- a/src/backend/optimizer/plan/Makefile +++ b/src/backend/optimizer/plan/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = analyzejoins.o createplan.o initsplan.o planagg.o planmain.o planner.o \ - setrefs.o subselect.o pgxcplan.o + setrefs.o subselect.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 0c9c52e685..88a6ca1eae 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -5,6 +5,11 @@ * Planning is complete, we just need to convert the selected * Path into a Plan. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -37,6 +42,29 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#ifdef PGXC +#include "access/gtm.h" +#include "parser/parse_coerce.h" +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#include "pgxc/postgresql_fdw.h" +#include "access/sysattr.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "executor/executor.h" +#ifdef XCP +#include "access/gtm.h" +#include "catalog/pg_aggregate.h" +#include "parser/parse_coerce.h" +#else +#include "rewrite/rewriteManip.h" +#endif /* XCP */ +#include "commands/prepare.h" +#include "commands/tablecmds.h" +#endif /* PGXC */ #include "utils/lsyscache.h" @@ -52,6 +80,13 @@ static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_p static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path); static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path); static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path); +#ifdef XCP +static void adjustSubplanDistribution(PlannerInfo *root, Distribution *pathd, + Distribution *subd); +static RemoteSubplan *create_remotescan_plan(PlannerInfo *root, + RemoteSubPath *best_path); +static char *get_internal_cursor(void); +#endif static SeqScan *create_seqscan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); static Scan *create_indexscan_plan(PlannerInfo *root, IndexPath *best_path, @@ -73,6 +108,32 @@ static CteScan *create_ctescan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); static 
WorkTableScan *create_worktablescan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); +#ifdef PGXC +#ifndef XCP +static RowMarkClause *mk_row_mark_clause(PlanRowMark *prm); +static bool compare_alias(Alias *a1, Alias *a2); +static Plan *create_remotequery_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses); +static Plan *create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, + Plan *parent, Plan *outer_plan, Plan *inner_plan); +static List *create_remote_target_list(PlannerInfo *root, + StringInfo targets, List *out_tlist, List *in_tlist, + char *out_alias, int out_index, + char *in_alias, int in_index); +static Alias *generate_remote_rte_alias(RangeTblEntry *rte, int varno, + char *aliasname, int reduce_level); +static void pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist, + AttrNumber *grpColIdx); +static List *pgxc_process_grouping_targetlist(PlannerInfo *root, + List **local_tlist); +static List *pgxc_process_having_clause(PlannerInfo *root, List *remote_tlist, + Node *havingQual, List **local_qual, + List **remote_qual, bool *reduce_plan); +static Expr *pgxc_set_en_expr(Oid tableoid, Index resultRelationIndex); +static int pgxc_count_rowmarks_entries(List *rowMarks); +static Oid *pgxc_build_rowmark_entries(List *rowMarks, List *rtable, Oid *types, int prepparams, int totparams); +#endif /* XCP */ +#endif /* PGXC */ static ForeignScan *create_foreignscan_plan(PlannerInfo *root, ForeignPath *best_path, List *tlist, List *scan_clauses); static NestLoop *create_nestloop_plan(PlannerInfo *root, NestPath *best_path, @@ -120,6 +181,12 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual, Index scanrelid, int ctePlanId, int cteParam); static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual, Index scanrelid, int wtParam); +#ifdef PGXC +#ifndef XCP +static RemoteQuery *make_remotequery(List *qptlist, List *qpqual, + Index scanrelid); +#endif +#endif static BitmapAnd 
*make_bitmap_and(List *bitmapplans); static BitmapOr *make_bitmap_or(List *bitmapplans); static NestLoop *make_nestloop(List *tlist, @@ -165,6 +232,22 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec, Relids relids); static Material *make_material(Plan *lefttree); +#ifdef PGXC +#ifndef XCP +static void findReferencedVars(List *parent_vars, RemoteQuery *plan, List **out_tlist, Relids *out_relids); +static void create_remote_clause_expr(PlannerInfo *root, Plan *parent, StringInfo clauses, + List *qual, RemoteQuery *scan); +static void create_remote_expr(PlannerInfo *root, Plan *parent, StringInfo expr, + Node *node, RemoteQuery *scan); +#endif /* XCP */ +#endif /* PGXC */ + +#ifdef XCP +static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, + bool nulls_first,int numCols, AttrNumber *sortColIdx, + Oid *sortOperators, Oid *collations, bool *nullsFirst); +#endif + /* * create_plan * Creates the access plan for a query by recursively processing the @@ -188,6 +271,11 @@ create_plan(PlannerInfo *root, Path *best_path) /* Initialize this module's private workspace in PlannerInfo */ root->curOuterRels = NULL; root->curOuterParams = NIL; +#ifdef XCP + root->curOuterRestrict = NULL; + adjustSubplanDistribution(root, root->distribution, + best_path->distribution); +#endif /* Recursively process the path tree */ plan = create_plan_recurse(root, best_path); @@ -221,8 +309,19 @@ create_plan_recurse(PlannerInfo *root, Path *best_path) case T_CteScan: case T_WorkTableScan: case T_ForeignScan: +#ifdef PGXC +#ifndef XCP + case T_RemoteQuery: +#endif /* XCP */ +#endif /* PGXC */ plan = create_scan_plan(root, best_path); break; +#ifdef XCP + case T_RemoteSubplan: + plan = (Plan *) create_remotescan_plan(root, + (RemoteSubPath *) best_path); + break; +#endif case T_HashJoin: case T_MergeJoin: case T_NestLoop: @@ -249,12 +348,6 @@ create_plan_recurse(PlannerInfo *root, Path *best_path) plan = create_unique_plan(root, (UniquePath *) best_path); break; 
-#ifdef PGXC - case T_RemoteQuery: - plan = create_remotequery_plan(root, - (RemoteQueryPath *)best_path); - break; -#endif default: elog(ERROR, "unrecognized node type: %d", (int) best_path->pathtype); @@ -394,6 +487,19 @@ create_scan_plan(PlannerInfo *root, Path *best_path) scan_clauses); break; +#ifdef PGXC +#ifndef XCP + case T_RemoteQuery: + /* For RemoteQuery path always use relation tlist */ + tlist = build_relation_tlist(rel); + plan = (Plan *) create_remotequery_plan(root, + best_path, + tlist, + scan_clauses); + break; +#endif /* XCP */ +#endif /* PGXC */ + case T_ForeignScan: plan = (Plan *) create_foreignscan_plan(root, (ForeignPath *) best_path, @@ -644,9 +750,642 @@ create_join_plan(PlannerInfo *root, JoinPath *best_path) list_concat(get_qpqual((Plan) plan), get_actual_clauses(get_loc_restrictinfo(best_path)))); #endif + +#ifdef PGXC +#ifndef XCP + /* + * Check if this join can be reduced to an equiv. remote scan node + * This can only be executed on a remote Coordinator + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + plan = create_remotejoin_plan(root, best_path, plan, outer_plan, inner_plan); +#endif /* XCP */ +#endif /* PGXC */ + return plan; } + +#ifdef PGXC +#ifndef XCP +/* + * create_remotejoin_plan + * check if the children plans involve remote entities from the same remote + * node. If so, this join can be reduced to an equivalent remote scan plan + * node + * + * RULES: + * + * * provide unique aliases to both inner and outer nodes to represent their + * corresponding subqueries + * + * * identify target entries from both inner and outer that appear in the join + * targetlist, only those need to be selected from these aliased subqueries + * + * * a join node has a joinqual list which represents the join condition. E.g. + * SELECT * from emp e LEFT JOIN emp2 d ON e.x = d.x + * Here the joinqual contains "e.x = d.x". 
If the joinqual itself has a local + * dependency, e.g "e.x = localfunc(d.x)", then this join cannot be reduced + * + * * other than the joinqual, the join node can contain additional quals. Even + * if they have any local dependencies, we can reduce the join and just + * append these quals into the reduced remote scan node. We DO do a pass to + * identify remote quals and ship those in the squery though + * + * * these quals (both joinqual and normal quals with no local dependencies) + * need to be converted into expressions referring to the aliases assigned to + * the nodes. These expressions will eventually become part of the squery of + * the reduced remote scan node + * + * * the children remote scan nodes themselves can have local dependencies in + * their quals (the remote ones are already part of the squery). We can still + * reduce the join and just append these quals into the reduced remote scan + * node + * + * * if we reached successfully so far, generate a new remote scan node with + * this new squery generated using the aliased references + * + * One important point to note here about targetlists is that this function + * does not set any DUMMY var references in the Var nodes appearing in it. It + * follows the standard mechanism as is followed by other nodes. Similar to the + * existing nodes, the references which point to DUMMY vars is done in + * set_remote_references() function in set_plan_references phase at the fag + * end. Avoiding such DUMMY references manipulations till the end also makes + * this code a lot much readable and easier. 
+ */ +static Plan * +create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Plan *outer_plan, Plan *inner_plan) +{ + NestLoop *nest_parent; + ExecNodes *join_exec_nodes; + RemoteQuery *outer; + RemoteQuery *inner; + + if (!enable_remotejoin) + return parent; + + /* meh, what are these for :( */ + if (root->hasPseudoConstantQuals) + return parent; + + /* do not optimize CURSOR based select statements */ + if (root->parse->rowMarks != NIL) + return parent; + + /* + * optimize only simple NestLoop joins for now. Other joins like Merge and + * Hash can be reduced too. But they involve additional intermediate nodes + * and we need to understand them a bit more as yet + */ + if (!IsA(parent, NestLoop)) + return parent; + else + nest_parent = (NestLoop *)parent; + + if (!IsA(outer_plan, RemoteQuery) || !IsA(inner_plan, RemoteQuery)) + return parent; + + outer = (RemoteQuery *)outer_plan; + inner = (RemoteQuery *)inner_plan; + + /* check if both the nodes qualify for reduction */ + if (!outer->scan.plan.qual && !inner->scan.plan.qual) + { + int i; + List *rtable_list = NIL; + List *parent_vars, *out_tlist = NIL, *in_tlist = NIL, *base_tlist; + Relids out_relids = NULL, in_relids = NULL; + + /* + * Check if both these plans are from the same remote node. If yes, + * replace this JOIN along with it's two children with one equivalent + * remote node + */ + + /* + * Build up rtable for XC Walker + * (was not sure I could trust this, but it seems to work in various cases) + */ + for (i = 0; i < root->simple_rel_array_size; i++) + { + RangeTblEntry *rte = root->simple_rte_array[i]; + + /* Check for NULL first, sometimes it is NULL at position 0 */ + if (rte) + rtable_list = lappend(rtable_list, root->simple_rte_array[i]); + } + /* + * Walk the left, right trees and identify which vars appear in the + * parent targetlist, only those need to be selected. 
Note that + * depending on whether the parent targetlist is top-level or + * intermediate, the children vars may or may not be referenced + * multiple times in it. + */ + parent_vars = pull_var_clause((Node *)parent->targetlist, + PVC_RECURSE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS); + + findReferencedVars(parent_vars, outer, &out_tlist, &out_relids); + findReferencedVars(parent_vars, inner, &in_tlist, &in_relids); + + join_exec_nodes = IsJoinReducible(inner, outer, in_relids, out_relids, + &(nest_parent->join), + best_path, root->parse->rtable); + /* XXX Check if the join optimization is possible */ + if (join_exec_nodes) + { + RemoteQuery *result; + Plan *result_plan; + StringInfoData targets, clauses, scan_clauses, fromlist, join_condition; + StringInfoData squery; + ListCell *l; + char in_alias[15], out_alias[15]; + bool use_where = false; + Index dummy_rtindex; + RangeTblEntry *dummy_rte; + List *local_scan_clauses = NIL, *remote_scan_clauses = NIL; + char *pname; + List *colnames; + + + /* KISS! As long as distinct aliases are provided for all the objects in + * involved in query, remote server should not crib! */ + sprintf(in_alias, "out_%d", root->rs_alias_index); + sprintf(out_alias, "in_%d", root->rs_alias_index); + + /* + * If the JOIN ON clause has a local dependency then we cannot ship + * the join to the remote side at all, bail out immediately. + */ + if (!pgxc_is_expr_shippable((Expr *)nest_parent->join.joinqual, NULL)) + { + elog(DEBUG1, "cannot reduce: local dependencies in the joinqual"); + return parent; + } + + /* + * If the normal plan qual has local dependencies, the join can + * still be shipped. Try harder to ship remote clauses out of the + * entire list. These local quals will become part of the quals + * list of the reduced remote scan node down later. 
+ */ + if (!pgxc_is_expr_shippable((Expr *)nest_parent->join.plan.qual, NULL)) + { + elog(DEBUG1, "local dependencies in the join plan qual"); + + /* + * trawl through each entry and come up with remote and local + * clauses... sigh + */ + foreach(l, nest_parent->join.plan.qual) + { + Node *clause = lfirst(l); + + /* + * if the currentof in the above call to + * clause_is_local_bound is set, somewhere in the list there + * is currentof clause, so keep that information intact and + * pass a dummy argument here. + */ + if (!pgxc_is_expr_shippable((Expr *)clause, NULL)) + local_scan_clauses = lappend(local_scan_clauses, clause); + else + remote_scan_clauses = lappend(remote_scan_clauses, clause); + } + } + else + { + /* + * there is no local bound clause, all the clauses are remote + * scan clauses + */ + remote_scan_clauses = nest_parent->join.plan.qual; + } + + /* generate the tlist for the new RemoteScan node using out_tlist, in_tlist */ + initStringInfo(&targets); + colnames = create_remote_target_list(root, &targets, out_tlist, in_tlist, + out_alias, outer->reduce_level, in_alias, inner->reduce_level); + + /* + * generate the fromlist now. The code has to appropriately mention + * the JOIN type in the string being generated. + */ + initStringInfo(&fromlist); + appendStringInfo(&fromlist, " (%s) %s ", + outer->sql_statement, quote_identifier(out_alias)); + + use_where = false; + switch (nest_parent->join.jointype) + { + case JOIN_INNER: + pname = ", "; + use_where = true; + break; + case JOIN_LEFT: + pname = "LEFT JOIN"; + break; + case JOIN_FULL: + pname = "FULL JOIN"; + break; + case JOIN_RIGHT: + pname = "RIGHT JOIN"; + break; + case JOIN_SEMI: + case JOIN_ANTI: + default: + return parent; + } + + /* + * splendid! we can actually replace this join hierarchy with a + * single RemoteScan node now. 
Start off by constructing the + * appropriate new tlist and tupdescriptor + */ + result = makeNode(RemoteQuery); + + /* + * Save various information about the inner and the outer plans. We + * may need this information later if more entries are added to it + * as part of the remote expression optimization + */ + result->read_only = true; + result->inner_alias = pstrdup(in_alias); + result->outer_alias = pstrdup(out_alias); + result->inner_reduce_level = inner->reduce_level; + result->outer_reduce_level = outer->reduce_level; + result->inner_relids = in_relids; + result->outer_relids = out_relids; + result->inner_statement = pstrdup(inner->sql_statement); + result->outer_statement = pstrdup(outer->sql_statement); + result->join_condition = NULL; + result->exec_nodes = join_exec_nodes; + result->is_temp = inner->is_temp || outer->is_temp; + + appendStringInfo(&fromlist, " %s (%s) %s", + pname, inner->sql_statement, quote_identifier(in_alias)); + + /* generate join.joinqual remote clause string representation */ + initStringInfo(&clauses); + if (nest_parent->join.joinqual != NIL) + { + create_remote_clause_expr(root, parent, &clauses, + nest_parent->join.joinqual, result); + } + + /* generate join.plan.qual remote clause string representation */ + initStringInfo(&scan_clauses); + if (remote_scan_clauses != NIL) + { + create_remote_clause_expr(root, parent, &scan_clauses, + remote_scan_clauses, result); + } + + /* + * set the base tlist of the involved base relations, useful in + * set_plan_refs later. Additionally the tupledescs should be + * generated using this base_tlist and not the parent targetlist. + * This is because we want to take into account any additional + * column references from the scan clauses too + */ + base_tlist = add_to_flat_tlist(NIL, list_concat(out_tlist, in_tlist)); + + /* + * Create and append the dummy range table entry to the range table. 
+ * Note that this modifies the master copy the caller passed us, otherwise + * e.g EXPLAIN VERBOSE will fail to find the rte the Vars built below refer + * to. + */ + dummy_rte = make_dummy_remote_rte("__REMOTE_JOIN_QUERY__", + makeAlias("__REMOTE_JOIN_QUERY__", colnames)); + root->parse->rtable = lappend(root->parse->rtable, dummy_rte); + dummy_rtindex = list_length(root->parse->rtable); + + result_plan = &result->scan.plan; + + /* Set the join targetlist to the new base_tlist */ + result_plan->targetlist = parent->targetlist; + result_plan->lefttree = NULL; + result_plan->righttree = NULL; + result->scan.scanrelid = dummy_rtindex; + + /* generate the squery for this node */ + + /* NOTE: it's assumed that the remote_paramNums array is + * filled in the same order as we create the query here. + * + * TODO: we need some way to ensure that the remote_paramNums + * is filled in the same order as the order in which the clauses + * are added in the query below. + */ + initStringInfo(&squery); + appendStringInfo(&squery, "SELECT %s FROM %s", targets.data, fromlist.data); + + initStringInfo(&join_condition); + if (clauses.data[0] != '\0') + appendStringInfo(&join_condition, " %s %s", use_where? " WHERE " : " ON ", clauses.data); + + if (scan_clauses.data[0] != '\0') + appendStringInfo(&join_condition, " %s %s", use_where? " AND " : " WHERE ", scan_clauses.data); + + if (join_condition.data[0] != '\0') + appendStringInfoString(&squery, join_condition.data); + + result->sql_statement = squery.data; + result->join_condition = join_condition.data; + /* don't forget to increment the index for the next time around! */ + result->reduce_level = root->rs_alias_index++; + + + /* set_plan_refs needs this later */ + result->base_tlist = base_tlist; + + /* + * if there were any local scan clauses stick them up here. They + * can come from the join node or from remote scan node themselves. 
+ * Because of the processing being done earlier in + * create_remotescan_plan, all of the clauses if present will be + * local ones and hence can be stuck without checking for + * remoteness again here into result_plan->qual + */ + result_plan->qual = list_concat(result_plan->qual, outer_plan->qual); + result_plan->qual = list_concat(result_plan->qual, inner_plan->qual); + result_plan->qual = list_concat(result_plan->qual, local_scan_clauses); + + /* we actually need not worry about costs since this is the final plan */ + result_plan->startup_cost = outer_plan->startup_cost; + result_plan->total_cost = outer_plan->total_cost; + result_plan->plan_rows = outer_plan->plan_rows; + result_plan->plan_width = outer_plan->plan_width; + + return (Plan *)result_plan; + } + } + + return parent; +} + +/* + * Generate aliases for columns of remote tables using the + * colname_varno_varattno_reduce_level nomenclature + */ +static Alias * +generate_remote_rte_alias(RangeTblEntry *rte, int varno, char *aliasname, int reduce_level) +{ + int maxattrs; + int varattno; + List *colnames = NIL; + StringInfo attr = makeStringInfo(); + Relation relation; + + if (rte->rtekind != RTE_RELATION) + elog(ERROR, "called in improper context"); + + relation = heap_open(rte->relid, AccessShareLock); + + maxattrs = RelationGetNumberOfAttributes(relation); + + for (varattno = 0; varattno < maxattrs; varattno++) + { + char *attname = get_rte_attribute_name(rte, varattno + 1); + + if (reduce_level == 0) + { + /* + * Even if reduce level is 0, we still need to copy column aliases + * from rte because we don't want to loose any user-supplied table + * column aliases, in case any. 
+ */ + colnames = lappend(colnames, makeString(pstrdup((attname)))); + } + else + { + resetStringInfo(attr); + appendStringInfo(attr, "%s_%d_%d_%d", + attname, varno, varattno + 1, reduce_level); + colnames = lappend(colnames, makeString(pstrdup(attr->data))); + } + + } + + heap_close(relation, AccessShareLock); + + return makeAlias(aliasname, colnames); +} + +/* create_remote_target_list + * generate a targetlist using out_alias and in_alias appropriately. It is + * possible that in case of multiple-hierarchy reduction, both sides can have + * columns with the same name. E.g. consider the following: + * + * select * from emp e join emp f on e.x = f.x, emp g; + * + * So if we just use new_alias.columnname it can + * very easily clash with other columnname from the same side of an already + * reduced join. To avoid this, we generate unique column aliases using the + * following convention: + * colname_varno_varattno_reduce_level_index + * + * Each RemoteScan node carries it's reduce_level index to indicate the + * convention that should be adopted while referring to it's columns. If the + * level is 0, then normal column names can be used because they will never + * clash at the join level + */ +static List * +create_remote_target_list(PlannerInfo *root, StringInfo targets, List *out_tlist, List *in_tlist, + char *out_alias, int out_index, char *in_alias, int in_index) +{ + int i = 0; + ListCell *l; + StringInfo attrname = makeStringInfo(); + bool add_null_target = true; + List *colnames = NIL; + + foreach(l, out_tlist) + { + Var *var = (Var *) lfirst(l); + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + char *attname; + + + if (i++ > 0) + appendStringInfo(targets, ", "); + + attname = get_rte_attribute_name(rte, var->varattno); + + if (out_index) + { + resetStringInfo(attrname); + /* varattno can be negative for sys attributes, hence the abs! 
*/ + appendStringInfo(attrname, "%s_%d_%d_%d", + attname, var->varno, abs(var->varattno), out_index); + appendStringInfo(targets, "%s.%s", + quote_identifier(out_alias), quote_identifier(attrname->data)); + } + else + appendStringInfo(targets, "%s.%s", + quote_identifier(out_alias), quote_identifier(attname)); + + /* generate the new alias now using root->rs_alias_index */ + resetStringInfo(attrname); + appendStringInfo(attrname, "%s_%d_%d_%d", + attname, var->varno, abs(var->varattno), root->rs_alias_index); + appendStringInfo(targets, " AS %s", quote_identifier(attrname->data)); + colnames = lappend(colnames, makeString(pstrdup(attrname->data))); + add_null_target = false; + } + + foreach(l, in_tlist) + { + Var *var = (Var *) lfirst(l); + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + char *attname; + + if (i++ > 0) + appendStringInfo(targets, ", "); + + attname = get_rte_attribute_name(rte, var->varattno); + + if (in_index) + { + resetStringInfo(attrname); + /* varattno can be negative for sys attributes, hence the abs! */ + appendStringInfo(attrname, "%s_%d_%d_%d", + attname, var->varno, abs(var->varattno), in_index); + appendStringInfo(targets, "%s.%s", + quote_identifier(in_alias), quote_identifier(attrname->data)); + } + else + appendStringInfo(targets, "%s.%s", + quote_identifier(in_alias), quote_identifier(attname)); + + /* generate the new alias now using root->rs_alias_index */ + resetStringInfo(attrname); + appendStringInfo(attrname, "%s_%d_%d_%d", + attname, var->varno, abs(var->varattno), root->rs_alias_index); + appendStringInfo(targets, " AS %s", quote_identifier(attrname->data)); + colnames = lappend(colnames, makeString(pstrdup(attrname->data))); + add_null_target = false; + } + + /* + * It's possible that in some cases, the targetlist might not refer to any + * vars from the joined relations, eg. 
+ * select count(*) from t1, t2; select const from t1, t2; etc + * For such cases just add a NULL selection into this targetlist + */ + if (add_null_target) + appendStringInfo(targets, " NULL "); + return colnames; +} + +/* + * create_remote_clause_expr + * generate a string to represent the clause list expression using out_alias + * and in_alias references. This function does a cute hack by temporarily + * modifying the rte->eref entries of the involved relations to point to + * out_alias and in_alias appropriately. The deparse_expression call then + * generates a string using these erefs which is exactly what is desired here. + * + * Additionally it creates aliases for the column references based on the + * reduce_level values too. This handles the case when both sides have same + * named columns.. + * + * Obviously this function restores the eref, alias values to their former selves + * appropriately too, after use + */ +static void +create_remote_clause_expr(PlannerInfo *root, Plan *parent, StringInfo clauses, + List *qual, RemoteQuery *scan) +{ + Node *node = (Node *) make_ands_explicit(qual); + + return create_remote_expr(root, parent, clauses, node, scan); +} + +static void +create_remote_expr(PlannerInfo *root, Plan *parent, StringInfo expr, + Node *node, RemoteQuery *scan) +{ + List *context; + List *leref = NIL; + ListCell *cell; + char *exprstr; + int rtindex; + Relids tmprelids, relids; + + relids = pull_varnos((Node *)node); + + tmprelids = bms_copy(relids); + + while ((rtindex = bms_first_member(tmprelids)) >= 0) + { + RangeTblEntry *rte = planner_rt_fetch(rtindex, root); + + /* + * This rtindex should be a member of either out_relids or + * in_relids and never both + */ + if (bms_is_member(rtindex, scan->outer_relids) && + bms_is_member(rtindex, scan->inner_relids)) + elog(ERROR, "improper relid references in the join clause list"); + + /* + * save the current rte->eref and rte->alias values and stick in a new + * one in the rte with the proper inner 
or outer alias + */ + leref = lappend(leref, rte->eref); + leref = lappend(leref, rte->alias); + + if (bms_is_member(rtindex, scan->outer_relids)) + { + rte->eref = makeAlias(scan->outer_alias, NIL); + + /* attach proper column aliases.. */ + rte->alias = generate_remote_rte_alias(rte, rtindex, + scan->outer_alias, scan->outer_reduce_level); + } + if (bms_is_member(rtindex, scan->inner_relids)) + { + rte->eref = makeAlias(scan->inner_alias, NIL); + + /* attach proper column aliases.. */ + rte->alias = generate_remote_rte_alias(rte, rtindex, + scan->inner_alias, scan->inner_reduce_level); + } + } + bms_free(tmprelids); + + /* Set up deparsing context */ + context = deparse_context_for_plan((Node *) parent, + NULL, + root->parse->rtable); + + exprstr = deparse_expression(node, context, true, false); + + /* revert back the saved eref entries in the same order now! */ + cell = list_head(leref); + tmprelids = bms_copy(relids); + while ((rtindex = bms_first_member(tmprelids)) >= 0) + { + RangeTblEntry *rte = planner_rt_fetch(rtindex, root); + + Assert(cell != NULL); + + rte->eref = lfirst(cell); + cell = lnext(cell); + + rte->alias = lfirst(cell); + cell = lnext(cell); + } + bms_free(tmprelids); + + appendStringInfo(expr, " %s", exprstr); + return; +} +#endif /* XCP */ +#endif /* PGXC */ + /* * create_append_plan * Create an Append plan for 'best_path' and (recursively) plans @@ -924,6 +1663,14 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path) subplan = (Plan *) make_result(root, newtlist, NULL, subplan); else subplan->targetlist = newtlist; +#ifdef XCP + /* + * RemoteSubplan is conditionally projection capable - it is pushing + * projection to the data nodes + */ + if (IsA(subplan, RemoteSubplan)) + subplan->lefttree->targetlist = newtlist; +#endif } /* @@ -1045,6 +1792,140 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path) } +#ifdef XCP +/* + * adjustSubplanDistribution + * Make sure the distribution of the subplan is matching to the 
consumers. + */ +static void +adjustSubplanDistribution(PlannerInfo *root, Distribution *pathd, + Distribution *subd) +{ + /* Replace path restriction with actual */ + if (pathd && !bms_is_empty(root->curOuterRestrict)) + { + bms_free(pathd->restrictNodes); + pathd->restrictNodes = bms_copy(root->curOuterRestrict); + } + + root->curOuterRestrict = NULL; + + /* + * Set new restriction for the subpath + * Do not restrict if distributions are equal, they are going to be merged + * and subplan will be executed on caller nodes. + * However if there are upper query levels caller's distribution may be + * adjusted. + */ + if (subd && !equal(subd, pathd)) + { + /* + * If subpath is replicated without restriction choose one execution + * datanode and set it as current restriction. + */ + if (IsLocatorReplicated(subd->distributionType) && + bms_num_members(subd->restrictNodes) != 1) + { + Bitmapset *result = NULL; + Bitmapset *execute; + Bitmapset *common; + int node; + + /* + * We should choose one of the distribution nodes, but we can save + * some network traffic if chosen execution node will be one of + * the result nodes at the same time. + */ + if (pathd) + result = bms_is_empty(pathd->restrictNodes) ? + pathd->nodes : pathd->restrictNodes; + execute = bms_is_empty(subd->restrictNodes) ? 
+ subd->nodes : subd->restrictNodes; + common = bms_intersect(result, execute); + if (bms_is_empty(common)) + { + bms_free(common); + common = bms_copy(subd->nodes); + } + + /* + * Check if any of the common nodes is preferred and choose one + * of the preferred + */ + node = GetAnyDataNode(common); + bms_free(common); + + /* set restriction for the subplan */ + root->curOuterRestrict = bms_make_singleton(node); + + /* replace execution restriction for the generated */ + bms_free(subd->restrictNodes); + subd->restrictNodes = bms_make_singleton(node); + } + } +} + +/* + * create_remotescan_plan + * Create a RemoteSubquery plan for 'best_path' and (recursively) plans + * for its subpaths. + * + * Returns a Plan node. + */ +static RemoteSubplan * +create_remotescan_plan(PlannerInfo *root, + RemoteSubPath *best_path) +{ + RemoteSubplan *plan; + Plan *subplan; + Bitmapset *saverestrict; + + /* + * Subsequent code will modify current restriction, it needs to be restored + * so other path nodes in the outer tree could see correct value. 
+ */ + saverestrict = root->curOuterRestrict; + + adjustSubplanDistribution(root, best_path->path.distribution, + best_path->subpath->distribution); + + subplan = create_plan_recurse(root, best_path->subpath); + + /* We don't want any excess columns in the remote tuples */ + disuse_physical_tlist(subplan, best_path->subpath); + + plan = make_remotesubplan(root, subplan, + best_path->path.distribution, + best_path->subpath->distribution, + best_path->path.pathkeys); + + copy_path_costsize(&plan->scan.plan, (Path *) best_path); + + /* restore current restrict */ + bms_free(root->curOuterRestrict); + root->curOuterRestrict = saverestrict; + + return plan; +} + + +RemoteSubplan * +find_push_down_plan(Plan *plan, bool force) +{ + if (IsA(plan, RemoteSubplan) && + (force || (list_length(((RemoteSubplan *) plan)->nodeList) > 1 && + ((RemoteSubplan *) plan)->execOnAll))) + return (RemoteSubplan *) plan; + if (IsA(plan, Hash) || + IsA(plan, Material) || + IsA(plan, Unique) || + IsA(plan, Limit)) + return find_push_down_plan(plan->lefttree, force); + return NULL; +} +#endif + + /***************************************************************************** * * BASE-RELATION SCAN METHODS @@ -1862,6 +2743,335 @@ create_worktablescan_plan(PlannerInfo *root, Path *best_path, return scan_plan; } + +#ifdef PGXC +#ifndef XCP +/* + * mk_row_mark_clause + * Given a PlanRowMark, create a corresponding RowMarkClause + */ +static RowMarkClause * +mk_row_mark_clause(PlanRowMark *prm) +{ + RowMarkClause *rmc; + + if (prm == NULL) + return NULL; + + /* We are intrested in either FOR UPDATE or FOR SHARE */ + if (prm->markType != ROW_MARK_EXCLUSIVE && prm->markType != ROW_MARK_SHARE) + return NULL; + + rmc = makeNode(RowMarkClause); + + /* Copy rti as is form the PlanRowMark */ + rmc->rti = prm->rti; + + /* Assume FOR SHARE unless compelled FOR UPDATE */ + rmc->forUpdate = false; + if (prm->markType == ROW_MARK_EXCLUSIVE) + rmc->forUpdate = true; + + /* Copy noWait as is form the PlanRowMark 
*/ + rmc->noWait = prm->noWait; + + /* true or false does not matter since we will use the result only while deparsing */ + rmc->pushedDown = false; + + return rmc; +} + +/* + * compare_alias + * Compare two aliases + */ +static bool +compare_alias(Alias *a1, Alias *a2) +{ + if (a1 == NULL && a2 == NULL) + return true; + + if (a1 == NULL && a2 != NULL) + return false; + + if (a2 == NULL && a1 != NULL) + return false; + + if (strcmp(a1->aliasname, a2->aliasname) == 0) + return true; + + return false; +} + +/* + * contains_only_vars(tlist) + * Return true only if each element of tlist is a target entry having Var node + * as its containing expression. + */ +static bool +contains_only_vars(List *tlist) +{ + ListCell *l; + + foreach(l, (List *) tlist) + { + Node *tle = lfirst(l); + if (nodeTag(tle) != T_TargetEntry) + return false; + else + { + Expr *expr = ((TargetEntry *) tle)->expr; + if (nodeTag(expr) != T_Var) + return false; + } + } + return true; +} + +/* + * create_remotequery_plan + * Returns a remotequery plan for the base relation scanned by 'best_path' + * with restriction clauses 'scan_clauses' and targetlist 'tlist'. + */ +static Plan * +create_remotequery_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses) +{ + RemoteQuery *scan_plan; + Index scan_relid = best_path->parent->relid; + RangeTblEntry *rte; + List *remote_scan_clauses = NIL; + List *local_scan_clauses = NIL; + StringInfoData sql; + Query *query; + RangeTblRef *rtr; + List *varlist; + ListCell *varcell; + Node *tmp_node; + List *rmlist; + List *tvarlist; + bool tlist_is_simple; + List *base_tlist; /* the target list representing the + * result obtained from datanode + */ + RangeTblEntry *dummy_rte; /* RTE for the remote query node being + * added. 
+ */ + Index dummy_rtindex; + + Assert(scan_relid > 0); + Assert(best_path->parent->rtekind == RTE_RELATION); + + /* Sort clauses into best execution order */ + scan_clauses = order_qual_clauses(root, scan_clauses); + /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ + scan_clauses = extract_actual_clauses(scan_clauses, false); + + if (scan_clauses) + { + ListCell *l; + + foreach(l, (List *)scan_clauses) + { + Node *clause = lfirst(l); + + if (pgxc_is_expr_shippable((Expr *)clause, NULL)) + remote_scan_clauses = lappend(remote_scan_clauses, clause); + else + local_scan_clauses = lappend(local_scan_clauses, clause); + } + } + + /* + * The target list passed in may not contain the Vars required for + * evaluating the quals. Add those quals in the targetlist + */ + tlist = add_to_flat_tlist(tlist, copyObject(pull_var_clause((Node *)local_scan_clauses, + PVC_RECURSE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS))); + tlist_is_simple = contains_only_vars(tlist); + + /* + * Construct a Query structure for the query to be fired on the Datanodes + * and deparse it. Fields not set remain memzero'ed as set by makeNode. + */ + rte = rt_fetch(scan_relid, root->parse->rtable); + Assert(rte->rtekind == RTE_RELATION); + /* Make a copy of RTE to be included in the new query structure */ + rte = copyObject(rte); + /* This RTE should appear in FROM clause of the SQL statement constructed */ + rte->inFromCl = true; + + query = makeNode(Query); + query->commandType = CMD_SELECT; + query->rtable = list_make1(rte); + query->jointree = makeNode(FromExpr); + + rtr = makeNode(RangeTblRef); + rtr->rtindex = list_length(query->rtable); + /* There can be only one table */ + Assert(rtr->rtindex == 1); + + query->jointree->fromlist = list_make1(rtr); + query->jointree->quals = (Node *)make_ands_explicit(copyObject(remote_scan_clauses)); + + /* + * RemoteQuery node cannot handle arbitrary expressions in the target list. 
+ * So if the target list has any elements that are not plain Vars, we need + * to create a Result node above RemoteQuery, and assign a plain var tlist + * in RemoteQuery node, and Result node will handle the expressions. So if + * the passed-in tlist is not a simple vars tlist, derive one out of the + * tlist. + */ + if (tlist_is_simple) + query->targetList = copyObject(tlist); + else + { + tvarlist = copyObject(pull_var_clause((Node *)tlist, + PVC_RECURSE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS)); + query->targetList = add_to_flat_tlist(NIL, copyObject(tvarlist)); + } + + /* + * We are going to change the Var nodes in the target list to be sent to the + * datanode. We need the original tlist to establish the mapping of result + * obtained from the datanode in this plan. It will be saved in + * RemoteQuery->base_tlist. So, copy the target list before modifying it + */ + base_tlist = copyObject(query->targetList); + + /* + * Change the varno in Var nodes in the targetlist of the query to be shipped to the + * Datanode to 1, to match the rtable in the query. Do the same for Var + * nodes in quals. + */ + varlist = list_concat(pull_var_clause((Node *)query->targetList, + PVC_RECURSE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS), + pull_var_clause((Node *)query->jointree->quals, + PVC_RECURSE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS)); + + foreach(varcell, varlist) + { + Var *var = lfirst(varcell); + if (var->varno != scan_relid) + elog(ERROR, "Single table scan can not handle vars from more than one relation"); + var->varno = rtr->rtindex; + } + list_free(varlist); + + /* + * Call fix_scan_expr to fix the PlaceHolderVars. This step is not needed if + * we construct the query at the time of execution. 
+ */ + tmp_node = pgxc_fix_scan_expr(root, (Node *)query->targetList, 0); + Assert(!tmp_node || IsA(tmp_node, List)); + query->targetList = (List *)tmp_node; + tmp_node = pgxc_fix_scan_expr(root, (Node *)query->jointree->quals, 0); + query->jointree->quals = tmp_node; + + /* + * Before deparsing the query we need to check whether there are any FOR UPDATE/SHARE clauses + * in the query that we need to propagate to Datanodes + */ + rmlist = NULL; + if (root->xc_rowMarks != NULL) + { + ListCell *rmcell; + + foreach(rmcell, root->xc_rowMarks) + { + PlanRowMark *prm = lfirst(rmcell); + RangeTblEntry *rte_in_rm; + + /* + * One remote query node contains one table only, check to make sure that + * this row mark clause is referring to the same table that this remote + * query node is targeting. + */ + rte_in_rm = rt_fetch(prm->rti, root->parse->rtable); + if (rte_in_rm->relid == rte->relid && compare_alias(rte->alias, rte_in_rm->alias)) + { + RowMarkClause *rmc; + + /* + * Change the range table index in the row mark clause to 1 + * to match the rtable in the query + */ + prm->rti = 1; + + /* Come up with a Row Mark Clause given a Plan Row Mark */ + rmc = mk_row_mark_clause(prm); + + if (rmc != NULL) + { + /* Add this row mark clause to the list to be added in the query to deparse */ + rmlist = lappend(rmlist, rmc); + + /* + * Although we can have mutiple row mark clauses even for a single table + * but here we will have only one plan row mark clause per table + * The reason is that here we are talking about only FOR UPDATE & FOR SHARE + * If we have both FOR SHARE and FOR UPDATE mentioned for the same table + * FOR UPDATE takes priority over FOR SHARE and in effect we will have only one clause. 
+ */ + break; + } + } + } + + /* copy the row mark clause list in the query to deparse */ + query->rowMarks = rmlist; + + /* If there is a row mark clause, set the flag for deprasing of the row mark clause */ + if (rmlist != NULL) + query->hasForUpdate = true; + } + initStringInfo(&sql); + deparse_query(query, &sql, NIL); + + if (rmlist != NULL) + list_free_deep(rmlist); + + /* + * Create and append the dummy range table entry to the range table. + * Note that this modifies the master copy the caller passed us, otherwise + * e.g EXPLAIN VERBOSE will fail to find the rte the Vars built below refer + * to. + */ + dummy_rte = make_dummy_remote_rte(get_rel_name(rte->relid), + makeAlias("_REMOTE_TABLE_QUERY_", NIL)); + root->parse->rtable = lappend(root->parse->rtable, dummy_rte); + dummy_rtindex = list_length(root->parse->rtable); + + scan_plan = make_remotequery(tlist, local_scan_clauses, dummy_rtindex); + + /* Track if the remote query involves a temporary object */ + scan_plan->is_temp = IsTempTable(rte->relid); + scan_plan->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate); + scan_plan->has_row_marks = query->hasForUpdate; + scan_plan->sql_statement = sql.data; + scan_plan->base_tlist = base_tlist; + scan_plan->exec_nodes = GetRelationNodesByQuals(rte->relid, rtr->rtindex, + query->jointree->quals, + RELATION_ACCESS_READ); + if (!scan_plan->exec_nodes) + elog(ERROR, "No distribution information found for relid %d", rte->relid); + + copy_path_costsize(&scan_plan->scan.plan, best_path); + + /* PGXCTODO - get better estimates */ + scan_plan->scan.plan.plan_rows = 1000; + + scan_plan->has_ins_child_sel_parent = root->parse->is_ins_child_sel_parent; + + return (Plan *)scan_plan; +} +#endif /* XCP */ +#endif /* PGXC */ + /* * create_foreignscan_plan * Returns a foreignscan plan for the base relation scanned by 'best_path' @@ -2019,6 +3229,27 @@ create_nestloop_plan(PlannerInfo *root, else prev = cell; } +#ifdef XCP + /* + * While NestLoop is executed 
it rescans inner plan. We do not want to + * rescan RemoteSubplan and do not support it. + * So if inner_plan is a RemoteSubplan, materialize it. + */ + if (IsA(inner_plan, RemoteSubplan)) + { + Plan *matplan = (Plan *) make_material(inner_plan); + + /* + * We assume the materialize will not spill to disk, and therefore + * charge just cpu_operator_cost per tuple. (Keep this estimate in + * sync with cost_mergejoin.) + */ + copy_plan_costsize(matplan, inner_plan); + matplan->total_cost += cpu_operator_cost * matplan->plan_rows; + + inner_plan = matplan; + } +#endif join_plan = make_nestloop(tlist, joinclauses, @@ -3286,6 +4517,382 @@ make_worktablescan(List *qptlist, return node; } + +#ifdef PGXC +#ifndef XCP +static RemoteQuery * +make_remotequery(List *qptlist, List *qpqual, Index scanrelid) +{ + RemoteQuery *node = makeNode(RemoteQuery); + Plan *plan = &node->scan.plan; + + /* cost should be inserted by caller */ + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = NULL; + plan->righttree = NULL; + node->scan.scanrelid = scanrelid; + node->read_only = true; + node->has_row_marks = false; + + return node; +} +#endif /* XCP */ +#endif /* PGXC */ + + +#ifdef XCP +/* + * make_remotesubplan + * Create a RemoteSubplan node to execute subplan on remote nodes. + * leftree - the subplan which we want to push down to remote node. + * resultDistribution - the distribution of the remote result. May be NULL - + * results are coming to the invoking node + * execDistribution - determines how source data of the subplan are + * distributed, where we should send the subplan and how combine results. 
+ * pathkeys - the remote subplan is sorted according to these keys, executor + * should perform merge sort of incoming tuples + */ +RemoteSubplan * +make_remotesubplan(PlannerInfo *root, + Plan *lefttree, + Distribution *resultDistribution, + Distribution *execDistribution, + List *pathkeys) +{ + RemoteSubplan *node = makeNode(RemoteSubplan); + Plan *plan = &node->scan.plan; + Bitmapset *tmpset; + int nodenum; + + /* Sanity checks */ + Assert(!equal(resultDistribution, execDistribution)); + Assert(!IsA(lefttree, RemoteSubplan)); + + if (resultDistribution) + { + node->distributionType = resultDistribution->distributionType; + node->distributionKey = InvalidAttrNumber; + if (resultDistribution->distributionExpr) + { + ListCell *lc; + Expr *expr; + + /* XXX Is that correct to reference a column of different type? */ + if (IsA(resultDistribution->distributionExpr, RelabelType)) + expr = ((RelabelType *) resultDistribution->distributionExpr)->arg; + else + expr = (Expr *) resultDistribution->distributionExpr; + + /* Find distribution expression in the target list */ + foreach(lc, lefttree->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (equal(tle->expr, expr)) + { + node->distributionKey = tle->resno; + break; + } + } + + if (node->distributionKey == InvalidAttrNumber) + { + TargetEntry *newtle; + + /* The expression is not found, need to add junk */ + newtle = makeTargetEntry(expr, + list_length(lefttree->targetlist) + 1, + NULL, + true); + + if (is_projection_capable_plan(lefttree)) + { + /* Ok to modify subplan's target list */ + lefttree->targetlist = lappend(lefttree->targetlist, newtle); + } + else + { + /* Use Result node to calculate expression */ + List *newtlist = list_copy(lefttree->targetlist); + newtlist = lappend(newtlist, newtle); + lefttree = (Plan *) make_result(root, newtlist, NULL, lefttree); + } + + node->distributionKey = newtle->resno; + } + } + /* + * The distributionNodes describes result distribution + */ + tmpset = 
bms_copy(resultDistribution->nodes); + node->distributionNodes = NIL; + while ((nodenum = bms_first_member(tmpset)) >= 0) + node->distributionNodes = lappend_int(node->distributionNodes, + nodenum); + bms_free(tmpset); + /* + * The distributionRestrict defines the set of nodes where results are + * actually shipped. These are the nodes where upper level step + * is executed. + */ + if (resultDistribution->restrictNodes) + { + tmpset = bms_copy(resultDistribution->restrictNodes); + node->distributionRestrict = NIL; + while ((nodenum = bms_first_member(tmpset)) >= 0) + node->distributionRestrict = + lappend_int(node->distributionRestrict, nodenum); + bms_free(tmpset); + } + else + node->distributionRestrict = list_copy(node->distributionNodes); + } + else + { + node->distributionType = LOCATOR_TYPE_NONE; + node->distributionKey = InvalidAttrNumber; + node->distributionNodes = NIL; + } + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + copy_plan_costsize(plan, lefttree); + /* determine where subplan will be executed */ + if (execDistribution) + { + if (execDistribution->restrictNodes) + tmpset = bms_copy(execDistribution->restrictNodes); + else + tmpset = bms_copy(execDistribution->nodes); + node->nodeList = NIL; + while ((nodenum = bms_first_member(tmpset)) >= 0) + node->nodeList = lappend_int(node->nodeList, nodenum); + bms_free(tmpset); + node->execOnAll = list_length(node->nodeList) == 1 || + !IsLocatorReplicated(execDistribution->distributionType); + } + else + { + /* + * Prepare single execution of replicated subplan. 
Choose one node from + * the execution node list, preferrably the node is also a member of + * the list of result nodes, so later all node executors contact the + * same node to get tuples + */ + tmpset = NULL; + if (!bms_is_empty(resultDistribution->restrictNodes)) + tmpset = bms_copy(resultDistribution->restrictNodes); + else + tmpset = bms_copy(resultDistribution->nodes); + /* + * If result goes on single node execute subplan locally + */ + if (bms_num_members(tmpset) > 1) + { + /* get one execution node TODO: load balancing */ + nodenum = bms_first_member(tmpset); + node->nodeList = list_make1_int(nodenum); + node->execOnAll = true; + } + else + { + node->nodeList = NIL; + node->execOnAll = false; + } + bms_free(tmpset); + } + plan->targetlist = lefttree->targetlist; + /* We do not need to merge sort if only one node is yielding tuples */ + if (pathkeys && node->execOnAll && list_length(node->nodeList) > 1) + { + List *tlist = lefttree->targetlist; + ListCell *i; + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* + * We will need at most list_length(pathkeys) sort columns; possibly less + */ + numsortkeys = list_length(pathkeys); + sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber)); + sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid)); + collations = (Oid *) palloc(numsortkeys * sizeof(Oid)); + nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool)); + + numsortkeys = 0; + + foreach(i, pathkeys) + { + PathKey *pathkey = (PathKey *) lfirst(i); + EquivalenceClass *ec = pathkey->pk_eclass; + TargetEntry *tle = NULL; + Oid pk_datatype = InvalidOid; + Oid sortop; + ListCell *j; + + if (ec->ec_has_volatile) + { + /* + * If the pathkey's EquivalenceClass is volatile, then it must + * have come from an ORDER BY clause, and we have to match it to + * that same targetlist entry. 
+ */ + if (ec->ec_sortref == 0) /* can't happen */ + elog(ERROR, "volatile EquivalenceClass has no sortref"); + tle = get_sortgroupref_tle(ec->ec_sortref, tlist); + Assert(tle); + Assert(list_length(ec->ec_members) == 1); + pk_datatype = ((EquivalenceMember *) linitial(ec->ec_members))->em_datatype; + } + else + { + /* + * Otherwise, we can sort by any non-constant expression listed in + * the pathkey's EquivalenceClass. For now, we take the first one + * that corresponds to an available item in the tlist. If there + * isn't any, use the first one that is an expression in the + * input's vars. (The non-const restriction only matters if the + * EC is below_outer_join; but if it isn't, it won't contain + * consts anyway, else we'd have discarded the pathkey as + * redundant.) + * + * XXX if we have a choice, is there any way of figuring out which + * might be cheapest to execute? (For example, int4lt is likely + * much cheaper to execute than numericlt, but both might appear + * in the same equivalence class...) Not clear that we ever will + * have an interesting choice in practice, so it may not matter. + */ + foreach(j, ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(j); + + if (em->em_is_const) + continue; + + tle = tlist_member((Node *) em->em_expr, tlist); + if (tle) + { + pk_datatype = em->em_datatype; + break; /* found expr already in tlist */ + } + + /* + * We can also use it if the pathkey expression is a relabel + * of the tlist entry, or vice versa. This is needed for + * binary-compatible cases (cf. make_pathkey_from_sortinfo). + * We prefer an exact match, though, so we do the basic search + * first. 
+ */ + tle = tlist_member_ignore_relabel((Node *) em->em_expr, tlist); + if (tle) + { + pk_datatype = em->em_datatype; + break; /* found expr already in tlist */ + } + } + + if (!tle) + { + /* No matching tlist item; look for a computable expression */ + Expr *sortexpr = NULL; + + foreach(j, ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(j); + List *exprvars; + ListCell *k; + + if (em->em_is_const) + continue; + sortexpr = em->em_expr; + exprvars = pull_var_clause((Node *) sortexpr, + PVC_INCLUDE_AGGREGATES, + PVC_INCLUDE_PLACEHOLDERS); + foreach(k, exprvars) + { + if (!tlist_member_ignore_relabel(lfirst(k), tlist)) + break; + } + list_free(exprvars); + if (!k) + { + pk_datatype = em->em_datatype; + break; /* found usable expression */ + } + } + if (!j) + elog(ERROR, "could not find pathkey item to sort"); + + /* + * Do we need to insert a Result node? + */ + if (!is_projection_capable_plan(lefttree)) + { + /* copy needed so we don't modify input's tlist below */ + tlist = copyObject(tlist); + lefttree = (Plan *) make_result(root, tlist, NULL, + lefttree); + } + + /* + * Add resjunk entry to input's tlist + */ + tle = makeTargetEntry(sortexpr, + list_length(tlist) + 1, + NULL, + true); + tlist = lappend(tlist, tle); + lefttree->targetlist = tlist; /* just in case NIL before */ + } + } + + /* + * Look up the correct sort operator from the PathKey's slightly + * abstracted representation. + */ + sortop = get_opfamily_member(pathkey->pk_opfamily, + pk_datatype, + pk_datatype, + pathkey->pk_strategy); + if (!OidIsValid(sortop)) /* should not happen */ + elog(ERROR, "could not find member %d(%u,%u) of opfamily %u", + pathkey->pk_strategy, pk_datatype, pk_datatype, + pathkey->pk_opfamily); + + /* + * The column might already be selected as a sort key, if the pathkeys + * contain duplicate entries. (This can happen in scenarios where + * multiple mergejoinable clauses mention the same var, for example.) 
+ * So enter it only once in the sort arrays. + */ + numsortkeys = add_sort_column(tle->resno, + sortop, + pathkey->pk_eclass->ec_collation, + pathkey->pk_nulls_first, + numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + } + Assert(numsortkeys > 0); + + node->sort = makeNode(SimpleSort); + node->sort->numCols = numsortkeys; + node->sort->sortColIdx = sortColIdx; + node->sort->sortOperators = sortOperators; + node->sort->sortCollations = collations; + node->sort->nullsFirst = nullsFirst; + } + node->cursor = get_internal_cursor(); + node->unique = 0; + return node; +} +#endif /* XCP */ + + ForeignScan * make_foreignscan(List *qptlist, List *qpqual, @@ -3573,6 +5180,9 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols, Sort *node = makeNode(Sort); Plan *plan = &node->plan; Path sort_path; /* dummy for result of cost_sort */ +#ifdef XCP + RemoteSubplan *pushdown; +#endif copy_plan_costsize(plan, lefttree); /* only care about copying size */ cost_sort(&sort_path, root, NIL, @@ -3594,10 +5204,138 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols, node->collations = collations; node->nullsFirst = nullsFirst; +#ifdef XCP + /* + * It does not makes sence to sort on one data node and then perform + * one-tape merge sort. So do not push sort down if there is single + * remote data node + */ + pushdown = find_push_down_plan(lefttree, false); + if (pushdown) + { + /* If we already sort results, need to prepend new keys to existing */ + /* + * It is not safe to share colum information. 
+ * If another node will be pushed down the same RemoteSubplan column + * indexes may be modified and this would affect the Sort node + */ + AttrNumber *newSortColIdx; + Oid *newSortOperators; + Oid *newCollations; + bool *newNullsFirst; + int newNumCols; + int i, j; + + /* + * Insert new sort node immediately below the pushdown plan + */ + plan->lefttree = pushdown->scan.plan.lefttree; + pushdown->scan.plan.lefttree = plan; + + newNumCols = numCols + (pushdown->sort ? pushdown->sort->numCols : 0); + newSortColIdx = (AttrNumber *) palloc(newNumCols * sizeof(AttrNumber)); + newSortOperators = (Oid *) palloc(newNumCols * sizeof(Oid)); + newCollations = (Oid *) palloc(newNumCols * sizeof(Oid)); + newNullsFirst = (bool *) palloc(newNumCols * sizeof(bool)); + + /* Copy sort columns */ + for (i = 0; i < numCols; i++) + { + newSortColIdx[i] = sortColIdx[i]; + newSortOperators[i] = sortOperators[i]; + newCollations[i] = collations[i]; + newNullsFirst[i] = nullsFirst[i]; + } + + newNumCols = numCols; + if (pushdown->sort) + { + /* Continue and copy old keys of the subplan which is now under the + * sort */ + for (j = 0; j < pushdown->sort->numCols; j++) + newNumCols = add_sort_column(pushdown->sort->sortColIdx[j], + pushdown->sort->sortOperators[j], + pushdown->sort->sortCollations[j], + pushdown->sort->nullsFirst[j], + newNumCols, + newSortColIdx, + newSortOperators, + newCollations, + newNullsFirst); + } + else + { + /* Create simple sort object if does not exist */ + pushdown->sort = makeNode(SimpleSort); + } + + pushdown->sort->numCols = newNumCols; + pushdown->sort->sortColIdx = newSortColIdx; + pushdown->sort->sortOperators = newSortOperators; + pushdown->sort->sortCollations = newCollations; + pushdown->sort->nullsFirst = newNullsFirst; + + /* + * lefttree is not actually a Sort, but we hope it is not important and + * the result will be used as a generic Plan node. 
+ */ + return (Sort *) lefttree; + } +#endif return node; } /* + * add_sort_column --- utility subroutine for building sort info arrays + * + * We need this routine because the same column might be selected more than + * once as a sort key column; if so, the extra mentions are redundant. + * + * Caller is assumed to have allocated the arrays large enough for the + * max possible number of columns. Return value is the new column count. + */ +static int +add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, bool nulls_first, + int numCols, AttrNumber *sortColIdx, + Oid *sortOperators, Oid *collations, bool *nullsFirst) +{ + int i; + + Assert(OidIsValid(sortOp)); + + for (i = 0; i < numCols; i++) + { + /* + * Note: we check sortOp because it's conceivable that "ORDER BY foo + * USING <, foo USING <<<" is not redundant, if <<< distinguishes + * values that < considers equal. We need not check nulls_first + * however because a lower-order column with the same sortop but + * opposite nulls direction is redundant. + * + * We could probably consider sort keys with the same sortop and + * different collations to be redundant too, but for the moment treat + * them as not redundant. This will be needed if we ever support + * collations with different notions of equality. 
+ */ + if (sortColIdx[i] == colIdx && + sortOperators[i] == sortOp && + collations[i] == coll) + { + /* Already sorting by this col, so extra sort key is useless */ + return numCols; + } + } + + /* Add the column */ + sortColIdx[numCols] = colIdx; + sortOperators[numCols] = sortOp; + collations[numCols] = coll; + nullsFirst[numCols] = nulls_first; + return numCols + 1; +} + + +/* * prepare_sort_from_pathkeys * Prepare to sort according to given pathkeys * @@ -3823,6 +5561,14 @@ prepare_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, true); tlist = lappend(tlist, tle); lefttree->targetlist = tlist; /* just in case NIL before */ +#ifdef XCP + /* + * RemoteSubplan is conditionally projection capable - it is + * pushing projection to the data nodes + */ + if (IsA(lefttree, RemoteSubplan)) + lefttree->lefttree->targetlist = tlist; +#endif } /* @@ -4088,6 +5834,137 @@ materialize_finished_plan(Plan *subplan) return matplan; } + +#ifdef XCP +typedef struct +{ + List *subtlist; + List *newtlist; +} find_referenced_cols_context; + +static bool +find_referenced_cols_walker(Node *node, find_referenced_cols_context *context) +{ + TargetEntry *tle; + + if (node == NULL) + return false; + if (IsA(node, Aggref)) + { + /* + * We can not push down aggregates with DISTINCT. + */ + if (((Aggref *) node)->aggdistinct) + return true; + + /* + * We need to add aggregate reference to the new tlist if it + * is not already there. Phase 1 aggregate is actually returns values + * of transition data type, so we should change the data type of the + * expression. 
+ */ + if (!tlist_member(node, context->newtlist)) + { + Aggref *aggref = (Aggref *) node; + Aggref *newagg; + TargetEntry *newtle; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + Oid aggtranstype; + Oid aggcollecttype; + + aggTuple = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + aggref->aggfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + aggtranstype = aggform->aggtranstype; + aggcollecttype = aggform->aggcollecttype; + ReleaseSysCache(aggTuple); + + /* Can not split two-phase aggregate */ + if (!OidIsValid(aggcollecttype)) + return true; + + if (IsPolymorphicType(aggtranstype)) + { + Oid *inputTypes; + Oid *declaredArgTypes; + int agg_nargs; + int numArgs; + ListCell *l; + + inputTypes = (Oid *) palloc(sizeof(Oid) * list_length(aggref->args)); + numArgs = 0; + foreach(l, aggref->args) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (!tle->resjunk) + inputTypes[numArgs++] = exprType((Node *) tle->expr); + } + + /* have to fetch the agg's declared input types... 
*/ + (void) get_func_signature(aggref->aggfnoid, + &declaredArgTypes, &agg_nargs); + Assert(agg_nargs == numArgs); + + + aggtranstype = enforce_generic_type_consistency(inputTypes, + declaredArgTypes, + agg_nargs, + aggtranstype, + false); + pfree(inputTypes); + pfree(declaredArgTypes); + } + newagg = copyObject(aggref); + newagg->aggtype = aggtranstype; + + newtle = makeTargetEntry((Expr *) newagg, + list_length(context->newtlist) + 1, + NULL, + false); + context->newtlist = lappend(context->newtlist, newtle); + } + + return false; + } + /* + * If expression is in the subtlist copy it into new tlist + */ + tle = tlist_member(node, context->subtlist); + if (tle && !tlist_member((Node *) tle->expr, context->newtlist)) + { + TargetEntry *newtle; + newtle = makeTargetEntry((Expr *) copyObject(node), + list_length(context->newtlist) + 1, + tle->resname, + false); + context->newtlist = lappend(context->newtlist, newtle); + return false; + } + if (IsA(node, Var)) + { + /* + * Referenced Var is not a member of subtlist. + * Go ahead and add junk one. 
+ */ + TargetEntry *newtle; + newtle = makeTargetEntry((Expr *) copyObject(node), + list_length(context->newtlist) + 1, + NULL, + true); + context->newtlist = lappend(context->newtlist, newtle); + return false; + } + return expression_tree_walker(node, find_referenced_cols_walker, + (void *) context); +} +#endif + + Agg * make_agg(PlannerInfo *root, List *tlist, List *qual, AggStrategy aggstrategy, const AggClauseCosts *aggcosts, @@ -4099,6 +5976,9 @@ make_agg(PlannerInfo *root, List *tlist, List *qual, Plan *plan = &node->plan; Path agg_path; /* dummy for result of cost_agg */ QualCost qual_cost; +#ifdef XCP + RemoteSubplan *pushdown; +#endif node->aggstrategy = aggstrategy; node->numCols = numGroupCols; @@ -4151,6 +6031,141 @@ make_agg(PlannerInfo *root, List *tlist, List *qual, plan->lefttree = lefttree; plan->righttree = NULL; +#ifdef XCP + /* + * If lefttree is a distributed subplan we may optimize aggregates by + * pushing down transition phase to remote data notes, and therefore reduce + * traffic and distribute evaluation load. + * We need to find all Var and Aggref expressions in tlist and qual and make + * up a new tlist from these expressions. Update original Vars. + * Create new Agg node with the new tlist and aggdistribution AGG_SLAVE. + * Set new Agg node as a lefttree of the distributed subplan, moving + * existing lefttree down under the new Agg node. Set new tlist to the + * distributed subplan - it should be matching to the subquery. 
+ * Set node's aggdistribution to AGG_MASTER and continue node initialization + */ + pushdown = find_push_down_plan(lefttree, true); + if (pushdown) + { + find_referenced_cols_context context; + + context.subtlist = pushdown->scan.plan.targetlist; + context.newtlist = NIL; + if (find_referenced_cols_walker((Node *) tlist, &context) || + find_referenced_cols_walker((Node *) qual, &context)) + { + /* + * We found we can not push down this aggregate, clean up and + * fallback to default procedure + */ + node->aggdistribution = AGG_ONENODE; + } + else + { + Agg *phase1 = makeNode(Agg); + Plan *plan1 = &phase1->plan; + int i; + + phase1->aggdistribution = AGG_SLAVE; + phase1->aggstrategy = aggstrategy; + phase1->numCols = numGroupCols; + phase1->grpColIdx = grpColIdx; + phase1->grpOperators = grpOperators; + phase1->numGroups = numGroups; + + /* + * If we perform grouping we should make sure the grouping + * expressions are in the new tlist, and we should update indexes + * for the Phase2 aggregation node + */ + if (numGroupCols > 0) + { + AttrNumber *newGrpColIdx; + newGrpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) + * numGroupCols); + for (i = 0; i < numGroupCols; i++) + { + TargetEntry *tle; + TargetEntry *newtle; + + tle = (TargetEntry *) list_nth(context.subtlist, + grpColIdx[i] - 1); + newtle = tlist_member((Node *) tle->expr, context.newtlist); + if (newtle == NULL) + { + newtle = makeTargetEntry((Expr *) copyObject(tle->expr), + list_length(context.newtlist) + 1, + tle->resname, + false); + context.newtlist = lappend(context.newtlist, newtle); + } + newGrpColIdx[i] = newtle->resno; + } + node->grpColIdx = newGrpColIdx; + } + + /* + * If the pushdown plan is sorting update sort column indexes + */ + if (pushdown->sort) + { + SimpleSort *ssort = pushdown->sort; + for (i = 0; i < ssort->numCols; i++) + { + TargetEntry *tle; + TargetEntry *newtle; + + tle = (TargetEntry *) list_nth(context.subtlist, + grpColIdx[i] - 1); + newtle = tlist_member((Node *) 
tle->expr, context.newtlist); + if (newtle == NULL) + { + /* XXX maybe we should just remove the sort key ? */ + newtle = makeTargetEntry((Expr *) copyObject(tle->expr), + list_length(context.newtlist) + 1, + tle->resname, + false); + context.newtlist = lappend(context.newtlist, newtle); + } + ssort->sortColIdx[i] = newtle->resno; + } + } + + copy_plan_costsize(plan1, (Plan *) pushdown); // ??? + + /* + * We will produce a single output tuple if not grouping, and a tuple per + * group otherwise. + */ + if (aggstrategy == AGG_PLAIN) + plan1->plan_rows = 1; + else + plan1->plan_rows = numGroups; + + plan1->targetlist = context.newtlist; + plan1->qual = NIL; + plan1->lefttree = pushdown->scan.plan.lefttree; + pushdown->scan.plan.lefttree = plan1; + plan1->righttree = NULL; + + /* + * Update target lists of all plans from lefttree till phase1. + * All they should be the same if the tree is transparent for push + * down modification. + */ + while (lefttree != plan1) + { + lefttree->targetlist = context.newtlist; + lefttree = lefttree->lefttree; + } + + node->aggdistribution = AGG_MASTER; + } + } + else + node->aggdistribution = AGG_ONENODE; +#endif + return node; } @@ -4285,6 +6300,9 @@ make_unique(Plan *lefttree, List *distinctList) AttrNumber *uniqColIdx; Oid *uniqOperators; ListCell *slitem; +#ifdef XCP + RemoteSubplan *pushdown; +#endif copy_plan_costsize(plan, lefttree); @@ -4329,6 +6347,30 @@ make_unique(Plan *lefttree, List *distinctList) node->uniqColIdx = uniqColIdx; node->uniqOperators = uniqOperators; +#ifdef XCP + /* + * We want to filter out duplicates on nodes to reduce amount of data sent + * over network and reduce coordinator load. 
+ */ + pushdown = find_push_down_plan(lefttree, true); + if (pushdown) + { + Unique *node1 = makeNode(Unique); + Plan *plan1 = &node1->plan; + + copy_plan_costsize(plan1, pushdown->scan.plan.lefttree); + plan1->targetlist = pushdown->scan.plan.lefttree->targetlist; + plan1->qual = NIL; + plan1->lefttree = pushdown->scan.plan.lefttree; + pushdown->scan.plan.lefttree = plan1; + plan1->righttree = NULL; + + node1->numCols = numCols; + node1->uniqColIdx = uniqColIdx; + node1->uniqOperators = uniqOperators; + } +#endif + return node; } @@ -4434,6 +6476,9 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, { Limit *node = makeNode(Limit); Plan *plan = &node->plan; +#ifdef XCP + RemoteSubplan *pushdown; +#endif copy_plan_costsize(plan, lefttree); @@ -4492,6 +6537,37 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, node->limitOffset = limitOffset; node->limitCount = limitCount; +#ifdef XCP + if ((limitOffset == NULL || offset_est > 0) && + (limitCount == NULL || count_est > 0)) + { + /* + * We may reduce amount of rows sent over the network and do not send more + * rows then necessary + */ + pushdown = find_push_down_plan(lefttree, true); + if (pushdown) + { + Limit *node1 = makeNode(Limit); + Plan *plan1 = &node1->plan; + + copy_plan_costsize(plan1, pushdown->scan.plan.lefttree); + plan1->targetlist = pushdown->scan.plan.lefttree->targetlist; + plan1->qual = NIL; + plan1->lefttree = pushdown->scan.plan.lefttree; + pushdown->scan.plan.lefttree = plan1; + plan1->righttree = NULL; + + node1->limitOffset = NULL; + node1->limitCount = (Node *) makeConst(INT8OID, -1, + InvalidOid, + sizeof(int64), + Int64GetDatum(offset_est + count_est), + false, FLOAT8PASSBYVAL); + } + } +#endif + return node; } @@ -4538,6 +6614,73 @@ make_result(PlannerInfo *root, plan->righttree = NULL; node->resconstantqual = resconstantqual; +#ifdef XCP + if (subplan) + { + /* + * We do not gain performance when pushing down Result, but Result on + * top of RemoteSubplan 
would not allow to push down other plan nodes + */ + RemoteSubplan *pushdown; + pushdown = find_push_down_plan(subplan, true); + if (pushdown) + { + /* + * Avoid pushing down results if the RemoteSubplan performs merge + * sort. + */ + if (pushdown->sort) + return node; + + /* + * If remote subplan is generating distribution we should keep it + * correct. Set valid expression as a distribution key. + */ + if (pushdown->distributionKey != InvalidAttrNumber) + { + ListCell *lc; + TargetEntry *key; + + key = list_nth(pushdown->scan.plan.targetlist, + pushdown->distributionKey); + pushdown->distributionKey = InvalidAttrNumber; + foreach(lc, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + if (equal(tle->expr, key->expr)) + { + pushdown->distributionKey = tle->resno; + break; + } + } + + if (pushdown->distributionKey != InvalidAttrNumber) + { + /* Not found, adding */ + TargetEntry *newtle; + /* + * The target entry is *NOT* junk to ensure it is not + * filtered out before sending from the data node. 
+ */ + newtle = makeTargetEntry(copyObject(key->expr), + list_length(tlist) + 1, + key->resname, + false); + tlist = lappend(tlist, newtle); + /* just in case if it was NIL */ + plan->targetlist = tlist; + pushdown->distributionKey = newtle->resno; + } + } + /* This will be set as lefttree of the Result plan */ + plan->lefttree = pushdown->scan.plan.lefttree; + pushdown->scan.plan.lefttree = plan; + /* Now RemoteSubplan returns different values */ + pushdown->scan.plan.targetlist = tlist; + return (Result *) subplan; + } + } +#endif /* XCP */ return node; } @@ -4628,39 +6771,1502 @@ is_projection_capable_plan(Plan *plan) case T_MergeAppend: case T_RecursiveUnion: return false; +#ifdef XCP + /* + * Remote subplan may push down projection to the data nodes if do not + * performs merge sort + */ + case T_RemoteSubplan: + return ((RemoteSubplan *) plan)->sort == NULL && + is_projection_capable_plan(plan->lefttree); +#endif default: break; } return true; } + +#ifdef XCP +#define CNAME_MAXLEN 32 +static int cursor_id = 0; + + +/* + * Return a name unique for the cluster + */ +static char * +get_internal_cursor(void) +{ + char *cursor; + + cursor = (char *) palloc(CNAME_MAXLEN); + if (cursor_id++ == INT_MAX) + cursor_id = 0; + + snprintf(cursor, CNAME_MAXLEN - 1, "p_%d_%x_%x", + PGXCNodeId, getpid(), cursor_id); + return cursor; +} +#endif + + #ifdef PGXC +#ifndef XCP /* - * Wrapper functions to expose some functions to PGXC planner. These functions - * are meant to be wrappers just calling the static function in this file. If - * you need to add more functionality, add it to the original function. + * findReferencedVars() + * + * Constructs a list of those Vars in targetlist which are found in + * parent_vars (in other words, the intersection of targetlist and + * parent_vars). Returns a new list in *out_tlist and a bitmap of + * those relids found in the result. + * + * Additionally do look at the qual references to other vars! They + * also need to be selected.. 
*/ -List * -pgxc_order_qual_clauses(PlannerInfo *root, List *clauses) +static void +findReferencedVars(List *parent_vars, RemoteQuery *plan, List **out_tlist, Relids *out_relids) { - return order_qual_clauses(root, clauses); + List *vars; + Relids relids = NULL; + List *tlist = NIL; + ListCell *l; + + /* Pull vars from both the targetlist and the clauses attached to this plan */ + vars = pull_var_clause((Node *)plan->base_tlist, + PVC_RECURSE_AGGREGATES, + PVC_REJECT_PLACEHOLDERS); + + foreach(l, vars) + { + Var *var = lfirst(l); + + if (search_tlist_for_var(var, parent_vars)) + tlist = lappend(tlist, var); + + if (!bms_is_member(var->varno, relids)) + relids = bms_add_member(relids, var->varno); + } + + /* Now consider the local quals */ + vars = pull_var_clause((Node *)plan->scan.plan.qual, + PVC_RECURSE_AGGREGATES, + PVC_REJECT_PLACEHOLDERS); + + foreach(l, vars) + { + Var *var = lfirst(l); + + if (search_tlist_for_var(var, tlist) == NULL) + tlist = lappend(tlist, var); + + if (!bms_is_member(var->varno, relids)) + relids = bms_add_member(relids, var->varno); + } + + *out_tlist = tlist; + *out_relids = relids; } -List * -pgxc_build_relation_tlist(RelOptInfo *rel) +/* + * create_remoteinsert_plan() + * + * For every target relation, add a remote query node to carry out remote + * operations. + */ +Plan * +create_remoteinsert_plan(PlannerInfo *root, Plan *topplan) { - return build_relation_tlist(rel); + ModifyTable *mt = (ModifyTable *)topplan; + ListCell *l; + + /* We expect to work only on ModifyTable node */ + if (!IsA(topplan, ModifyTable)) + elog(ERROR, "Unexpected node type: %d", topplan->type); + + /* + * For every result relation, build a remote plan to execute remote insert. 
+ */ + foreach(l, mt->resultRelations) + { + Index resultRelationIndex = lfirst_int(l); + RangeTblEntry *ttab; + RelationLocInfo *rel_loc_info; + StringInfo buf, buf2; + RemoteQuery *fstep; + Oid nspid; + char *nspname; + int natts, att; + Oid *att_types; + char *relname; + bool first_att_printed = false; + + ttab = rt_fetch(resultRelationIndex, root->parse->rtable); + + /* Bad relation ? */ + if (ttab == NULL || ttab->rtekind != RTE_RELATION) + continue; + + /* Get location info of the target table */ + rel_loc_info = GetRelationLocInfo(ttab->relid); + if (rel_loc_info == NULL) + continue; + + /* For main string */ + buf = makeStringInfo(); + /* For values */ + buf2 = makeStringInfo(); + + /* Compose INSERT FROM target_table */ + nspid = get_rel_namespace(ttab->relid); + nspname = get_namespace_name(nspid); + relname = get_rel_name(ttab->relid); + + /* + * Do not qualify with namespace for TEMP tables. The schema name may + * vary on each node + */ + if (IsTempTable(ttab->relid)) + appendStringInfo(buf, "INSERT INTO %s (", + quote_identifier(relname)); + else + appendStringInfo(buf, "INSERT INTO %s.%s (", quote_identifier(nspname), + quote_identifier(relname)); + + fstep = make_remotequery(NIL, NIL, resultRelationIndex); + fstep->is_temp = IsTempTable(ttab->relid); + + natts = get_relnatts(ttab->relid); + att_types = (Oid *) palloc0 (sizeof (Oid) * natts); + + /* + * Populate the column information + */ + for (att = 1; att <= natts; att++) + { + HeapTuple tp; + + tp = SearchSysCache(ATTNUM, + ObjectIdGetDatum(ttab->relid), + Int16GetDatum(att), + 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp); + + /* Bypass dropped attributes in query */ + if (att_tup->attisdropped) + { + /* Dropped attributes are casted as int4 in prepared parameters */ + att_types[att - 1] = INT4OID; + } + else + { + /* Add comma before all except first attributes */ + if (first_att_printed) + appendStringInfoString(buf, ", "); + + /* Build 
the value part, parameters are filled at run time */ + if (first_att_printed) + appendStringInfoString(buf2, ", "); + + first_att_printed = true; + + /* Append column name */ + appendStringInfoString(buf, quote_identifier(NameStr(att_tup->attname))); + + /* Append value in string */ + appendStringInfo(buf2, "$%d", att); + + /* Assign parameter type */ + att_types[att - 1] = att_tup->atttypid; + } + + ReleaseSysCache(tp); + } + else + elog(ERROR, "cache lookup failed for attribute %d of relation %u", + att, ttab->relid); + } + + /* Gather the two strings */ + appendStringInfo(buf, ") VALUES (%s)", buf2->data); + + fstep->sql_statement = pstrdup(buf->data); + + /* Processed rows are counted by the main planner */ + fstep->combine_type = COMBINE_TYPE_NONE; + + fstep->read_only = false; + fstep->exec_nodes = makeNode(ExecNodes); + fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType; + fstep->exec_nodes->primarynodelist = NULL; + fstep->exec_nodes->nodeList = rel_loc_info->nodeList; + fstep->exec_nodes->en_relid = ttab->relid; + fstep->exec_nodes->accesstype = RELATION_ACCESS_INSERT; + fstep->exec_nodes->en_expr = pgxc_set_en_expr(ttab->relid, resultRelationIndex); + + SetRemoteStatementName((Plan *) fstep, NULL, natts, att_types, 0); + + /* Free everything */ + pfree(buf->data); + pfree(buf); + pfree(buf2->data); + pfree(buf2); + + mt->remote_plans = lappend(mt->remote_plans, fstep); + } + + return topplan; } -void -pgxc_copy_path_costsize(Plan *dest, Path *src) + +/* + * create_remoteupdate_plan() + * + * For every target relation, add a remote query node to carry out remote + * operations. + * WHERE and SET clauses are populated with the relation attributes. + * Target list is used for SET clause and completed with the expressions already given + * Those are the non-junk expressions in target list of parser tree. 
+ * WHERE clause is completed by the other expressions in target tree that have been + * marked as junk during target list rewriting to be able to identify consistently + * tuples on remote Coordinators. This target list is based on the information obtained + * from the inner plan that should be generated by create_remotequery_plan. + */ +Plan * +create_remoteupdate_plan(PlannerInfo *root, Plan *topplan) { - copy_path_costsize(dest, src); + ModifyTable *mt = (ModifyTable *)topplan; + ListCell *l; + + /* We expect to work only on ModifyTable node */ + if (!IsA(topplan, ModifyTable)) + elog(ERROR, "Unexpected node type: %d", topplan->type); + + /* + * For every result relation, build a remote plan to execute remote update. + */ + foreach(l, mt->resultRelations) + { + Index resultRelationIndex = lfirst_int(l); + Query *parse = root->parse; + RangeTblEntry *ttab; + RelationLocInfo *rel_loc_info; + StringInfo buf, buf2; + Oid nspid; /* Relation namespace Oid */ + char *nspname; /* Relation namespace name */ + Oid *param_types; /* Types of query parameters */ + bool is_set_printed = false; /* Control of SET generation */ + bool is_where_printed = false; /* Control of WHERE generation */ + RemoteQuery *fstep; /* Plan step generated */ + ListCell *elt; + int count = 0, where_count = 1; + int natts, count_prepparams, tot_prepparams; + char *relname; + + ttab = rt_fetch(resultRelationIndex, parse->rtable); + + /* Bad relation ? 
*/ + if (ttab == NULL || ttab->rtekind != RTE_RELATION) + continue; + + relname = get_rel_name(ttab->relid); + + /* Get location info of the target table */ + rel_loc_info = GetRelationLocInfo(ttab->relid); + if (rel_loc_info == NULL) + continue; + + /* Create query buffers */ + buf = makeStringInfo(); /* For SET clause */ + buf2 = makeStringInfo(); /* For WHERE clause */ + + /* Compose UPDATE target_table */ + natts = get_relnatts(ttab->relid); + nspid = get_rel_namespace(ttab->relid); + nspname = get_namespace_name(nspid); + + /* + * Do not qualify with namespace for TEMP tables. The schema name may + * vary on each node + */ + if (IsTempTable(ttab->relid)) + appendStringInfo(buf, "UPDATE ONLY %s SET ", + quote_identifier(relname)); + else + appendStringInfo(buf, "UPDATE ONLY %s.%s SET ", quote_identifier(nspname), + quote_identifier(relname)); + + /* + * Count the number of junk entries before setting the parameter type list. + * This helps to know how many parameters part of the WHERE clause need to + * be sent down by extended query protocol. + */ + foreach(elt, parse->targetList) + { + TargetEntry *tle = lfirst(elt); + if (tle->resjunk) + count++; + } + count_prepparams = natts + count; + /* Count entries related to Rowmarks */ + tot_prepparams = count_prepparams + pgxc_count_rowmarks_entries(root->rowMarks); + + /* Then allocate the array for this purpose */ + param_types = (Oid *) palloc0(sizeof (Oid) * tot_prepparams); + + /* + * Now build the query based on the target list. SET clause is completed + * by non-junk entries and WHERE clause by junk entries used to identify + * uniquely tuples on remote nodes. 
+ */ + foreach(elt, parse->targetList) + { + TargetEntry *tle = lfirst(elt); + + if (!tle->resjunk) + { + int attno = 0; + int i; + + /* Add target list element to SET clause */ + + /* Add comma before all except first attributes */ + if (!is_set_printed) + is_set_printed = true; + else + appendStringInfoString(buf, ", "); + + /* We need first to find the position of this element in attribute list */ + for (i = 0; i < natts; i++) + { + if (strcmp(tle->resname, + get_relid_attribute_name(ttab->relid, i + 1)) == 0) + { + attno = i + 1; + break; + } + } + + /* Complete string */ + appendStringInfo(buf, "%s = $%d", + tle->resname, + attno); + + /* Set parameter type correctly */ + param_types[attno - 1] = exprType((Node *) tle->expr); + } + else + { + /* Set parameter type */ + param_types[natts + where_count - 1] = exprType((Node *) tle->expr); + where_count++; + + /* + * ctid and xc_node_id are sufficient to identify + * remote tuple. + */ + if (strcmp(tle->resname, "xc_node_id") != 0 && + strcmp(tle->resname, "ctid") != 0) + continue; + + /* Set the clause if necessary */ + if (!is_where_printed) + { + is_where_printed = true; + appendStringInfoString(buf2, " WHERE "); + } + else + appendStringInfoString(buf2, "AND "); + + /* Complete string */ + appendStringInfo(buf2, "%s = $%d ", + tle->resname, + natts + where_count - 1); + } + } + + /* + * Before finalizing query be sure that there are no missing entries for attributes. + * If there are complete the last holes. Those ones are mandatory to insure that + * update is executed consistently. 
+ */ + for (count = 1; count <= natts; count++) + { + if (param_types[count - 1] == 0) + { + HeapTuple tp; + + tp = SearchSysCache(ATTNUM, + ObjectIdGetDatum(ttab->relid), + Int16GetDatum(count), + 0, 0); + + if (HeapTupleIsValid(tp)) + { + Form_pg_attribute att_saved = (Form_pg_attribute) GETSTRUCT(tp); + + /* + * Set parameter type of attribute + * Dropped columns are casted as int4 + */ + if (att_saved->attisdropped) + param_types[count - 1] = INT4OID; + else + param_types[count - 1] = att_saved->atttypid; + ReleaseSysCache(tp); + } + else + elog(ERROR, "cache lookup failed for attribute %d of relation %u", + count, ttab->relid); + } + } + + /* + * The query needs to be completed by nullifying the non-parent entries + * defined in RowMarks. This is essential for UPDATE queries running with child + * entries as we need to bypass them correctly at executor level. + */ + param_types = pgxc_build_rowmark_entries(root->rowMarks, parse->rtable, param_types, + count_prepparams, tot_prepparams); + + /* Finish building the query by gathering SET and WHERE clauses */ + appendStringInfo(buf, "%s", buf2->data); + + /* Finally build the final UPDATE step */ + fstep = make_remotequery(parse->targetList, NIL, resultRelationIndex); + fstep->is_temp = IsTempTable(ttab->relid); + fstep->sql_statement = pstrdup(buf->data); + fstep->combine_type = COMBINE_TYPE_NONE; + + fstep->read_only = false; + /* + * Get the nodes to execute the query on. We will execute this query on + * all nodes. The WHERE condition will take care of updating the columns + * accordingly. 
+ */ + fstep->exec_nodes = GetRelationNodes(rel_loc_info, 0, true, UNKNOWNOID, RELATION_ACCESS_UPDATE); + fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType; + fstep->exec_nodes->en_relid = ttab->relid; + fstep->exec_nodes->nodeList = rel_loc_info->nodeList; + fstep->exec_nodes->accesstype = RELATION_ACCESS_UPDATE; + fstep->exec_nodes->en_expr = pgxc_set_en_expr(ttab->relid, resultRelationIndex); + SetRemoteStatementName((Plan *) fstep, NULL, tot_prepparams, param_types, 0); + pfree(buf->data); + pfree(buf2->data); + pfree(buf); + pfree(buf2); + + mt->remote_plans = lappend(mt->remote_plans, fstep); + } + + return topplan; +} + +/* + * create_remotedelete_plan() + * + * For every target relation, add a remote query node to carry out remote + * operations. The tuple to be deleted is selected depending on the target + * list of given plan, generating parametrized WHERE clause in consequence. + */ +Plan * +create_remotedelete_plan(PlannerInfo *root, Plan *topplan) +{ + ModifyTable *mt = (ModifyTable *)topplan; + ListCell *l; + + /* We expect to work only on ModifyTable node */ + if (!IsA(topplan, ModifyTable)) + elog(ERROR, "Unexpected node type: %d", topplan->type); + + /* + * For every result relation, build a remote plan to execute remote delete. + */ + foreach(l, mt->resultRelations) + { + Index resultRelationIndex = lfirst_int(l); + Query *parse = root->parse; + RangeTblEntry *ttab; + RelationLocInfo *rel_loc_info; + StringInfo buf; + Oid nspid; /* Relation namespace Oid */ + char *nspname; /* Relation namespace name */ + int count_prepparams, tot_prepparams; /* Attribute used is CTID */ + Oid *param_types; /* Types of query parameters */ + RemoteQuery *fstep; /* Plan step generated */ + bool is_where_created = false; + ListCell *elt; + int count = 1; + char *relname; + + ttab = rt_fetch(resultRelationIndex, parse->rtable); + + /* Bad relation ? 
*/ + if (ttab == NULL || ttab->rtekind != RTE_RELATION) + continue; + + /* Get location info of the target table */ + rel_loc_info = GetRelationLocInfo(ttab->relid); + if (rel_loc_info == NULL) + continue; + + /* Create query buffers */ + buf = makeStringInfo(); + + /* Compose DELETE target_table */ + nspid = get_rel_namespace(ttab->relid); + nspname = get_namespace_name(nspid); + relname = get_rel_name(ttab->relid); + + /* Parameters are defined by target list */ + count_prepparams = list_length(parse->targetList); + + /* Count entries related to Rowmarks only if there are child relations here */ + if (list_length(mt->resultRelations) != 1) + tot_prepparams = count_prepparams + pgxc_count_rowmarks_entries(root->rowMarks); + else + tot_prepparams = count_prepparams; + + param_types = (Oid *) palloc0(sizeof(Oid) * tot_prepparams); + + /* + * Do not qualify with namespace for TEMP tables. The schema name may + * vary on each node. + */ + if (IsTempTable(ttab->relid)) + appendStringInfo(buf, "DELETE FROM ONLY %s ", + quote_identifier(relname)); + else + appendStringInfo(buf, "DELETE FROM ONLY %s.%s ", quote_identifier(nspname), + quote_identifier(relname)); + + /* Generate WHERE clause for each target list item */ + foreach(elt, parse->targetList) + { + TargetEntry *tle = lfirst(elt); + + /* Set up the parameter type */ + param_types[count - 1] = exprType((Node *) tle->expr); + count++; + + /* + * In WHERE clause, ctid and xc_node_id are + * sufficient to fetch a tuple from remote node. + */ + if (strcmp(tle->resname, "xc_node_id") != 0 && + strcmp(tle->resname, "ctid") != 0) + continue; + + /* Set the clause if necessary */ + if (!is_where_created) + { + is_where_created = true; + appendStringInfoString(buf, "WHERE "); + } + else + appendStringInfoString(buf, "AND "); + + appendStringInfo(buf, "%s = $%d ", + quote_identifier(tle->resname), + count - 1); + } + + /* + * The query needs to be completed by nullifying the non-parent entries + * defined in RowMarks. 
This is essential for UPDATE queries running with child + * entries as we need to bypass them correctly at executor level. + */ + param_types = pgxc_build_rowmark_entries(root->rowMarks, parse->rtable, param_types, + count_prepparams, tot_prepparams); + + /* Finish by building the plan step */ + fstep = make_remotequery(parse->targetList, NIL, resultRelationIndex); + fstep->is_temp = IsTempTable(ttab->relid); + fstep->sql_statement = pstrdup(buf->data); + fstep->combine_type = COMBINE_TYPE_NONE; + + fstep->read_only = false; + /* + * Get the nodes to execute the query on. We will execute this query on + * all nodes. The WHERE condition will take care of updating the columns + * accordingly. + */ + fstep->exec_nodes = GetRelationNodes(rel_loc_info, 0, true, UNKNOWNOID, + RELATION_ACCESS_UPDATE); + fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType; + fstep->exec_nodes->en_relid = ttab->relid; + fstep->exec_nodes->nodeList = rel_loc_info->nodeList; + fstep->exec_nodes->accesstype = RELATION_ACCESS_UPDATE; + SetRemoteStatementName((Plan *) fstep, NULL, tot_prepparams, param_types, 0); + pfree(buf->data); + pfree(buf); + + mt->remote_plans = lappend(mt->remote_plans, fstep); + } + + return topplan; } + +/* + * create_remotegrouping_plan + * Check if the grouping and aggregates can be pushed down to the + * Datanodes. + * Right now we can push with following restrictions + * 1. there are plain aggregates (no expressions involving aggregates) and/or + * expressions in group by clauses + * 2. No distinct or order by clauses + * 3. No windowing clause + * 4. No having clause + * + * Inputs + * root - planerInfo root for this query + * agg_plan - local grouping plan produced by grouping_planner() + * + * PGXCTODO: work on reducing these restrictions as much or document the reasons + * why we need the restrictions, in these comments themselves. 
In case of + * replicated tables, we should be able to push the whole query to the data + * node in case there are no local clauses. + */ Plan * -pgxc_create_gating_plan(PlannerInfo *root, Plan *plan, List *quals) +create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) +{ + Query *query = root->parse; + Sort *sort_plan; + RemoteQuery *remote_scan; /* remote query in the passed in plan */ + RemoteQuery *remote_group; /* remote query after optimization */ + Plan *remote_group_plan; /* plan portion of remote_group */ + Plan *temp_plan; + List *temp_vars; /* temporarily hold the VARs */ + List *temp_vartlist; /* temporarity hold tlist of VARs */ + ListCell *temp; + StringInfo remote_targetlist;/* SELECT clause of remote query */ + StringInfo remote_sql_stmt; + StringInfo groupby_clause; /* remote query GROUP BY */ + StringInfo orderby_clause; /* remote query ORDER BY */ + StringInfo remote_fromlist; /* remote query FROM */ + StringInfo in_alias; + StringInfo having_clause; /* remote query HAVING clause */ + Relids in_relids; /* the list of Relids referenced by lefttree */ + Index dummy_rtindex; + List *base_tlist; + RangeTblEntry *dummy_rte; + int numGroupCols; + AttrNumber *grpColIdx; + bool reduce_plan; + List *remote_qual; + List *local_qual; + + /* Remote grouping is not enabled, don't do anything */ + if (!enable_remotegroup) + return local_plan; + /* + * We don't push aggregation and grouping to Datanodes, in case there are + * windowing aggregates, distinct, having clause or sort clauses. 
+ */ + if (query->hasWindowFuncs || + query->distinctClause || + query->sortClause) + return local_plan; + + /* for now only Agg/Group plans */ + if (local_plan && IsA(local_plan, Agg)) + { + numGroupCols = ((Agg *)local_plan)->numCols; + grpColIdx = ((Agg *)local_plan)->grpColIdx; + } + else if (local_plan && IsA(local_plan, Group)) + { + numGroupCols = ((Group *)local_plan)->numCols; + grpColIdx = ((Group *)local_plan)->grpColIdx; + } + else + return local_plan; + + /* + * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, + * Result, Material nodes are optional. Sort is compulsory for Group but not + * for Agg. + * anything else is not handled right now. + */ + temp_plan = local_plan->lefttree; + remote_scan = NULL; + sort_plan = NULL; + if (temp_plan && IsA(temp_plan, Sort)) + { + sort_plan = (Sort *)temp_plan; + temp_plan = temp_plan->lefttree; + } + if (temp_plan && IsA(temp_plan, Result)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, Material)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, RemoteQuery)) + remote_scan = (RemoteQuery *)temp_plan; + + if (!remote_scan) + return local_plan; + /* + * for Group plan we expect Sort under the Group, which is always the case, + * the condition below is really for some possibly non-existent case. + */ + if (IsA(local_plan, Group) && !sort_plan) + return local_plan; + /* + * If the remote_scan has any quals on it, those need to be executed before + * doing anything. Hence we won't be able to push any aggregates or grouping + * to the Datanode. + * If it has any SimpleSort in it, then sorting is intended to be applied + * before doing anything. Hence can not push any aggregates or grouping to + * the Datanode. + */ + if (remote_scan->scan.plan.qual || remote_scan->sort) + return local_plan; + + /* + * Grouping_planner may add Sort node to sort the rows + * based on the columns in GROUP BY clause. 
Hence the columns in Sort and + * those in Group node in should be same. The columns are usually in the + * same order in both nodes, hence check the equality in order. If this + * condition fails, we can not handle this plan for now. + */ + if (sort_plan) + { + int cntCols; + if (sort_plan->numCols != numGroupCols) + return local_plan; + for (cntCols = 0; cntCols < numGroupCols; cntCols++) + { + if (sort_plan->sortColIdx[cntCols] != grpColIdx[cntCols]) + return local_plan; + } + } + + /* + * At last we find the plan underneath is reducible into a single + * RemoteQuery node. + */ + + /* find all the relations referenced by targetlist of Grouping node */ + temp_vars = pull_var_clause((Node *)local_plan->targetlist, + PVC_RECURSE_AGGREGATES, + PVC_REJECT_PLACEHOLDERS); + findReferencedVars(temp_vars, remote_scan, &temp_vartlist, &in_relids); + + /* + * process the targetlist of the grouping plan, also construct the + * targetlist of the query to be shipped to the remote side + */ + base_tlist = pgxc_process_grouping_targetlist(root, &(local_plan->targetlist)); + /* + * If can not construct a targetlist shippable to the Datanode. Resort to + * the plan created by grouping_planner() + */ + if (!base_tlist) + return local_plan; + + base_tlist = pgxc_process_having_clause(root, base_tlist, query->havingQual, + &local_qual, &remote_qual, &reduce_plan); + /* + * Because of HAVING clause, we can not push the aggregates and GROUP BY + * clause to the Datanode. Resort to the plan created by grouping planner. + */ + if (!reduce_plan) + return local_plan; + Assert(base_tlist); + + /* + * We are now ready to create the RemoteQuery node to push the query to + * Datanode. + * 1. Create a remote query node reflecting the query to be pushed to the + * Datanode. + * 2. Modify the Grouping node passed in, to accept the results sent by the + * Datanodes, then group and aggregate them, if needed. 
+ */ + remote_targetlist = makeStringInfo(); + remote_sql_stmt = makeStringInfo(); + groupby_clause = makeStringInfo(); + orderby_clause = makeStringInfo(); + remote_fromlist = makeStringInfo(); + in_alias = makeStringInfo(); + having_clause = makeStringInfo(); + + appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); + + /* + * Build partial RemoteQuery node to be used for creating the Select clause + * to be sent to the remote node. Rest of the node will be built later + */ + remote_group = makeNode(RemoteQuery); + + /* + * Save information about the plan we are reducing. + * We may need this information later if more entries are added to it + * as part of the remote expression optimization. + */ + remote_group->inner_alias = pstrdup(in_alias->data); + remote_group->inner_reduce_level = remote_scan->reduce_level; + remote_group->inner_relids = in_relids; + remote_group->inner_statement = pstrdup(remote_scan->sql_statement); + remote_group->exec_nodes = remote_scan->exec_nodes; + /* Don't forget to increment the index for the next time around! */ + remote_group->reduce_level = root->rs_alias_index++; + /* Remember if the remote query is accessing a temporary object */ + remote_group->is_temp = remote_scan->is_temp; + + /* Generate the select clause of the remote query */ + appendStringInfoString(remote_targetlist, "SELECT"); + foreach (temp, base_tlist) + { + TargetEntry *tle = lfirst(temp); + Node *expr = (Node *)tle->expr; + + create_remote_expr(root, local_plan, remote_targetlist, expr, remote_group); + + /* If this is not last target entry, add a comma */ + if (lnext(temp)) + appendStringInfoString(remote_targetlist, ","); + } + + /* Generate the from clause of the remote query */ + appendStringInfo(remote_fromlist, " FROM (%s) %s", + remote_group->inner_statement, remote_group->inner_alias); + + /* + * Generate group by clause for the remote query and recompute the group by + * column locations. 
We want the tuples from remote node to be ordered by + * the grouping columns so that ExecGroup can work without any modification, + * hence create a SimpleSort structure to be added to RemoteQuery (which + * will merge the sorted results and present to Group node in sorted + * manner). + */ + if (query->groupClause) + { + int cntCols; + char *sep; + + /* + * recompute the column ids of the grouping columns, + * the group column indexes computed earlier point in the + * targetlists of the scan plans under this node. But now the grouping + * column indexes will be pointing in the targetlist of the new + * RemoteQuery, hence those need to be recomputed + */ + pgxc_locate_grouping_columns(root, base_tlist, grpColIdx); + + appendStringInfoString(groupby_clause, "GROUP BY "); + sep = ""; + for (cntCols = 0; cntCols < numGroupCols; cntCols++) + { + appendStringInfo(groupby_clause, "%s%d", sep, grpColIdx[cntCols]); + sep = ", "; + } + if (sort_plan) + { + SimpleSort *remote_sort = makeNode(SimpleSort); + /* + * reuse the arrays allocated in sort_plan to create SimpleSort + * structure. sort_plan is useless henceforth. 
+ */ + remote_sort->numCols = sort_plan->numCols; + remote_sort->sortColIdx = sort_plan->sortColIdx; + remote_sort->sortOperators = sort_plan->sortOperators; + remote_sort->sortCollations = sort_plan->collations; + remote_sort->nullsFirst = sort_plan->nullsFirst; + appendStringInfoString(orderby_clause, "ORDER BY "); + sep = ""; + for (cntCols = 0; cntCols < remote_sort->numCols; cntCols++) + { + remote_sort->sortColIdx[cntCols] = grpColIdx[cntCols]; + appendStringInfo(orderby_clause, "%s%d", sep, + remote_sort->sortColIdx[cntCols]); + sep = ", "; + } + remote_group->sort = remote_sort; + } + } + + if (remote_qual) + { + appendStringInfoString(having_clause, "HAVING "); + create_remote_clause_expr(root, local_plan, having_clause, remote_qual, + remote_group); + } + + /* Generate the remote sql statement from the pieces */ + appendStringInfo(remote_sql_stmt, "%s %s %s %s %s", remote_targetlist->data, + remote_fromlist->data, groupby_clause->data, + orderby_clause->data, having_clause->data); + /* + * Create a dummy RTE for the remote query being created. Append the dummy + * range table entry to the range table. Note that this modifies the master + * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to + * find the rte the Vars built below refer to. Also create the tuple + * descriptor for the result of this query from the base_tlist (targetlist + * we used to generate the remote node query). 
+ */ + dummy_rte = make_dummy_remote_rte("__REMOTE_GROUP_QUERY__", + makeAlias("__REMOTE_GROUP_QUERY__", NIL)); + /* Rest will be zeroed out in makeNode() */ + root->parse->rtable = lappend(root->parse->rtable, dummy_rte); + dummy_rtindex = list_length(root->parse->rtable); + + /* Build rest of the RemoteQuery node and the plan there */ + remote_group_plan = &remote_group->scan.plan; + + /* The join targetlist becomes this node's tlist */ + remote_group_plan->targetlist = base_tlist; + remote_group_plan->lefttree = NULL; + remote_group_plan->righttree = NULL; + remote_group->scan.scanrelid = dummy_rtindex; + remote_group->sql_statement = remote_sql_stmt->data; + + /* set_plan_refs needs this later */ + remote_group->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate); + remote_group->has_row_marks = query->hasForUpdate; + remote_group->base_tlist = base_tlist; + + /* we actually need not worry about costs since this is the final plan */ + remote_group_plan->startup_cost = remote_scan->scan.plan.startup_cost; + remote_group_plan->total_cost = remote_scan->scan.plan.total_cost; + remote_group_plan->plan_rows = remote_scan->scan.plan.plan_rows; + remote_group_plan->plan_width = remote_scan->scan.plan.plan_width; + + /* + * Modify the passed in grouping plan according to the remote query we built + * Materialization is always needed for RemoteQuery in case we need to restart + * the scan. + */ + local_plan->lefttree = remote_group_plan; + local_plan->qual = local_qual; + /* indicate that we should apply collection function directly */ + if (IsA(local_plan, Agg)) + ((Agg *)local_plan)->skip_trans = true; + + return local_plan; +} + +/* + * pgxc_locate_grouping_columns + * Locates the grouping clauses in the given target list. This is very similar + * to locate_grouping_columns except that there is only one target list to + * search into. + * PGXCTODO: Can we reuse locate_grouping_columns() instead of writing this + * function? 
But this function is optimized to search in the same target list. + */ +static void +pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist, + AttrNumber *groupColIdx) +{ + int keyno = 0; + ListCell *gl; + + /* + * No work unless grouping. + */ + if (!root->parse->groupClause) + { + Assert(groupColIdx == NULL); + return; + } + Assert(groupColIdx != NULL); + + foreach(gl, root->parse->groupClause) + { + SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl); + TargetEntry *te = get_sortgroupclause_tle(grpcl, tlist); + if (!te) + elog(ERROR, "failed to locate grouping columns"); + groupColIdx[keyno++] = te->resno; + } +} + +/* + * pgxc_add_node_to_grouping_tlist + * Add the given node to the target list to be sent to the Datanode. If it's + * Aggref node, also change the passed in node to point to the Aggref node in + * the Datanode's target list + */ +static List * +pgxc_add_node_to_grouping_tlist(List *remote_tlist, Node *expr, Index ressortgroupref) { - return create_gating_plan(root, plan, quals); + TargetEntry *remote_tle; + Oid saved_aggtype = InvalidOid; + + /* + * When we add an aggregate to the remote targetlist the aggtype of such + * Aggref node is changed to aggtrantype. Hence while searching a given + * Aggref in remote targetlist, we need to change the aggtype accordingly + * and then switch it back. 
+ */ + if (IsA(expr, Aggref)) + { + Aggref *aggref = (Aggref *)expr; + saved_aggtype = aggref->aggtype; + aggref->aggtype = aggref->aggtrantype; + } + remote_tle = tlist_member(expr, remote_tlist); + if (IsA(expr, Aggref)) + ((Aggref *)expr)->aggtype = saved_aggtype; + + if (!remote_tle) + { + remote_tle = makeTargetEntry(copyObject(expr), + list_length(remote_tlist) + 1, + NULL, + false); + /* Copy GROUP BY/SORT BY reference for the locating group by columns */ + remote_tle->ressortgroupref = ressortgroupref; + remote_tlist = lappend(remote_tlist, remote_tle); + } + else + { + if (remote_tle->ressortgroupref == 0) + remote_tle->ressortgroupref = ressortgroupref; + else if (ressortgroupref == 0) + { + /* do nothing remote_tle->ressortgroupref has the right value */ + } + else + { + /* + * if the expression's TLE already has a Sorting/Grouping reference, + * and caller has passed a non-zero one as well, better both of them + * be same + */ + Assert(remote_tle->ressortgroupref == ressortgroupref); + } + } + + /* + * Replace the args of the local Aggref with Aggref node to be + * included in RemoteQuery node, so that set_plan_refs can convert + * the args into VAR pointing to the appropriate result in the tuple + * coming from RemoteQuery node + * PGXCTODO: should we push this change in targetlists of plans + * above? + */ + if (IsA(expr, Aggref)) + { + Aggref *local_aggref = (Aggref *)expr; + Aggref *remote_aggref = (Aggref *)remote_tle->expr; + Assert(IsA(remote_tle->expr, Aggref)); + remote_aggref->aggtype = remote_aggref->aggtrantype; + /* Is copyObject() needed here? probably yes */ + local_aggref->args = list_make1(makeTargetEntry(copyObject(remote_tle->expr), + 1, NULL, + false)); + } + return remote_tlist; +} +/* + * pgxc_process_grouping_targetlist + * The function scans the targetlist to check if the we can push anything + * from the targetlist to the Datanode. Following rules govern the choice + * 1. 
Either all of the aggregates are pushed to the Datanode or none is pushed + * 2. If there are no aggregates, the targetlist is good to be shipped as is + * 3. If aggregates are involved in expressions, we push the aggregates to the + * Datanodes but not the involving expressions. + * + * The function constructs the targetlist for the query to be pushed to the + * Datanode. It modifies the local targetlist to point to the expressions in + * remote targetlist wherever necessary (e.g. aggregates) + * + * PGXCTODO: we should be careful while pushing the function expressions, it's + * better to push functions like strlen() which can be evaluated at the + * Datanode, but we should avoid pushing functions which can only be evaluated + * at Coordinator. + */ +static List * +pgxc_process_grouping_targetlist(PlannerInfo *root, List **local_tlist) +{ + bool shippable_remote_tlist = true; + List *remote_tlist = NIL; + List *orig_local_tlist = NIL;/* Copy original local_tlist, in case it changes */ + ListCell *temp; + + /* + * Walk through the target list and find out whether we can push the + * aggregates and grouping to Datanodes. Also while doing so, create the + * targetlist for the query to be shipped to the Datanode. Adjust the local + * targetlist accordingly. + */ + foreach(temp, *local_tlist) + { + TargetEntry *local_tle = lfirst(temp); + Node *expr = (Node *)local_tle->expr; + bool has_aggs; + + /* + * If the expression is not Aggref but involves aggregates (has Aggref + * nodes in the expression tree, we can not push the entire expression + * to the Datanode, but push those aggregates to the Datanode, if those + * aggregates can be evaluated at the Datanodes (if is_foreign_expr + * returns true for entire expression). To evaluate the rest of the + * expression, we need to fetch the values of VARs participating in the + * expression. 
But, if we include the VARs under the aggregate nodes, + * they may not be part of GROUP BY clause, thus generating an invalid + * query. Hence, is_foreign_expr() wouldn't collect VARs under the + * expression tree rooted under Aggref node. + * For example, the original query is + * SELECT sum(val) * val2 FROM tab1 GROUP BY val2; + * the query pushed to the Datanode is + * SELECT sum(val), val2 FROM tab1 GROUP BY val2; + * Notice that, if we include val in the query, it will become invalid. + */ + if (!pgxc_is_expr_shippable((Expr *)expr, &has_aggs)) + { + shippable_remote_tlist = false; + break; + } + + /* + * We are about to change the local_tlist, check if we have already + * copied original local_tlist, if not take a copy + */ + if (!orig_local_tlist && has_aggs) + orig_local_tlist = copyObject(*local_tlist); + + /* + * If there are aggregates involved in the expression, whole expression + * can not be pushed to the Datanode. Pick up the aggregates and the + * VAR nodes not covered by aggregates. + */ + if (has_aggs) + { + ListCell *lcell; + List *aggs_n_vars; + /* + * This expression is not going to be pushed as whole, thus other + * clauses won't be able to find out this TLE in the results + * obtained from Datanode. Hence can't optimize this query. + * PGXCTODO: with projection support in RemoteQuery node, this + * condition can be worked around, please check. 
+ */ + if (local_tle->ressortgroupref > 0) + { + shippable_remote_tlist = false; + break; + } + + aggs_n_vars = pull_var_clause(expr, PVC_INCLUDE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS); + /* copy the aggregates into the remote target list */ + foreach (lcell, aggs_n_vars) + { + Assert(IsA(lfirst(lcell), Aggref) || IsA(lfirst(lcell), Var)); + remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, lfirst(lcell), + 0); + } + } + /* Expression doesn't contain any aggregate */ + else + remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, expr, + local_tle->ressortgroupref); + } + + if (!shippable_remote_tlist) + { + /* + * If local_tlist has changed but we didn't find anything shippable to + * Datanode, we need to restore the local_tlist to original state, + */ + if (orig_local_tlist) + *local_tlist = orig_local_tlist; + if (remote_tlist) + list_free_deep(remote_tlist); + remote_tlist = NIL; + } + else if (orig_local_tlist) + { + /* + * If we have changed the targetlist passed, we need to pass back the + * changed targetlist. Free the copy that has been created. + */ + list_free_deep(orig_local_tlist); + } + + return remote_tlist; +} + +/* + * pgxc_process_having_clause + * For every expression in the havingQual take following action + * 1. If it has aggregates, which can be evaluated at the Datanodes, add those + * aggregates to the targetlist and modify the local aggregate expressions to + * point to the aggregate expressions being pushed to the Datanode. Add this + * expression to the local qual to be evaluated locally. + * 2. If the expression does not have aggregates and the whole expression can be + * evaluated at the Datanode, add the expression to the remote qual to be + * evaluated at the Datanode. + * 3. If qual contains an expression which can not be evaluated at the data + * node, the parent group plan can not be reduced to a remote_query. 
+ */ +static List * +pgxc_process_having_clause(PlannerInfo *root, List *remote_tlist, Node *havingQual, + List **local_qual, List **remote_qual, + bool *reduce_plan) +{ + List *qual; + ListCell *temp; + + *reduce_plan = true; + *remote_qual = NIL; + *local_qual = NIL; + + if (!havingQual) + return remote_tlist; + /* + * PGXCTODO: we expect the quals in the form of List only. Is there a + * possibility that the quals will be another form? + */ + if (!IsA(havingQual, List)) + { + *reduce_plan = false; + return remote_tlist; + } + /* + * Copy the havingQual so that the copy can be modified later. In case we + * back out in between, the original expression remains intact. + */ + qual = copyObject(havingQual); + foreach(temp, qual) + { + Node *expr = lfirst(temp); + bool has_aggs; + List *vars_n_aggs; + + if (!pgxc_is_expr_shippable((Expr *)expr, &has_aggs)) + { + *reduce_plan = false; + break; + } + + if (has_aggs) + { + ListCell *lcell; + + /* Pull the aggregates and var nodes from the quals */ + vars_n_aggs = pull_var_clause(expr, PVC_INCLUDE_AGGREGATES, + PVC_RECURSE_PLACEHOLDERS); + /* copy the aggregates into the remote target list */ + foreach (lcell, vars_n_aggs) + { + Assert(IsA(lfirst(lcell), Aggref) || IsA(lfirst(lcell), Var)); + remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, lfirst(lcell), + 0); + } + *local_qual = lappend(*local_qual, expr); + } + else + *remote_qual = lappend(*remote_qual, expr); + } + + if (!(*reduce_plan)) + list_free_deep(qual); + + return remote_tlist; +} + +/* + * pgxc_set_en_expr + * Try to find the expression of distribution column to calculate node at plan execution + */ +static Expr * +pgxc_set_en_expr(Oid tableoid, Index resultRelationIndex) +{ + HeapTuple tp; + Form_pg_attribute partAttrTup; + Var *var; + RelationLocInfo *rel_loc_info; + + /* Get location info of the target table */ + rel_loc_info = GetRelationLocInfo(tableoid); + if (rel_loc_info == NULL) + return NULL; + + /* + * For hash/modulo distributed 
tables, the target node must be selected + * at the execution time based on the partition column value. + * + * For round robin distributed tables, tuples must be divided equally + * between the nodes. + * + * For replicated tables, tuple must be inserted in all the Datanodes + * + * XXX Need further testing for replicated and round-robin tables + */ + if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH && + rel_loc_info->locatorType != LOCATOR_TYPE_MODULO) + return NULL; + + tp = SearchSysCache(ATTNUM, + ObjectIdGetDatum(tableoid), + Int16GetDatum(rel_loc_info->partAttrNum), + 0, 0); + partAttrTup = (Form_pg_attribute) GETSTRUCT(tp); + + /* + * Create a Var for the distribution column and set it for + * execution time evaluation of target node. ExecEvalVar() picks + * up values from ecxt_scantuple if Var does not refer either OUTER + * or INNER varno. We utilize that mechanism to pick up values from + * the tuple returned by the current plan node + */ + var = makeVar(resultRelationIndex, + rel_loc_info->partAttrNum, + partAttrTup->atttypid, + partAttrTup->atttypmod, + partAttrTup->attcollation, + 0); + ReleaseSysCache(tp); + + return (Expr *) var; +} + +/* + * pgxc_count_rowmarks_entries + * Count the number of rowmarks that need to be added as prepared parameters + * for remote DML plan + */ +static int +pgxc_count_rowmarks_entries(List *rowMarks) +{ + int res = 0; + ListCell *elt; + + foreach(elt, rowMarks) + { + PlanRowMark *rc = (PlanRowMark *) lfirst(elt); + + /* RowMarks with different parent are not needed */ + if (rc->rti != rc->prti) + continue; + + /* + * Count the entry and move to next element + * For a non-parent rowmark, only ctid is used. + * For a parent rowmark, ctid and tableoid are used. 
+ */ + if (!rc->isParent) + res++; + else + res = res + 2; + } + + return res; +} + +/* + * pgxc_build_rowmark_entries + * Complete type array for SetRemoteStatementName based on given RowMarks list + * The list of total parameters is calculated based on the current number of prepared + * parameters and the rowmark list. + */ +static Oid * +pgxc_build_rowmark_entries(List *rowMarks, List *rtable, Oid *types, int prepparams, int totparams) +{ + Oid *newtypes = types; + int rowmark_entry_num; + int count = prepparams; + ListCell *elt; + + /* No modifications is list is empty */ + if (rowMarks == NIL) + return newtypes; + + /* Nothing to do, total number of parameters is already correct */ + if (prepparams == totparams) + return newtypes; + + /* Fetch number of extra entries related to Rowmarks */ + rowmark_entry_num = pgxc_count_rowmarks_entries(rowMarks); + + /* Nothing to do */ + if (rowmark_entry_num == 0) + return newtypes; + + /* This needs to be absolutely verified */ + Assert(totparams == (prepparams + rowmark_entry_num)); + + foreach(elt, rowMarks) + { + PlanRowMark *rc = (PlanRowMark *) lfirst(elt); + + /* RowMarks with different parent are not needed */ + if (rc->rti != rc->prti) + continue; + + /* Determine the correct parameter type */ + switch (rc->markType) + { + case ROW_MARK_COPY: + { + RangeTblEntry *rte = rt_fetch(rc->prti, rtable); + + /* + * PGXCTODO: We still need to determine the rowtype + * in case relation involved here is a view (see inherit.sql). 
+ */ + if (!OidIsValid(rte->relid)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Cannot generate remote query plan"), + errdetail("This relation rowtype cannot be fetched"))); + + /* + * This is the complete copy of a row, so it is necessary + * to set parameter as a rowtype + */ + count++; + newtypes[count - 1] = get_rel_type_id(rte->relid); + } + break; + + case ROW_MARK_REFERENCE: + /* Here we have a ctid for sure */ + count++; + newtypes[count - 1] = TIDOID; + + if (rc->isParent) + { + /* For a parent table, tableoid is also necessary */ + count++; + /* Set parameter type */ + newtypes[count - 1] = OIDOID; + } + break; + + /* Ignore other entries */ + case ROW_MARK_SHARE: + case ROW_MARK_EXCLUSIVE: + default: + break; + } + } + + /* This should not happen */ + if (count != totparams) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("Error when generating remote query plan"))); + + return newtypes; +} + +static RangeTblEntry * +make_dummy_remote_rte(char *relname, Alias *alias) +{ + RangeTblEntry *dummy_rte = makeNode(RangeTblEntry); + dummy_rte->rtekind = RTE_REMOTE_DUMMY; + + /* use a dummy relname... */ + dummy_rte->relname = relname; + dummy_rte->eref = alias; + + return dummy_rte; } +#endif /* XCP */ #endif /* PGXC */ diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index be52d16ff0..c0394f787c 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -17,6 +17,11 @@ * scan all the rows anyway. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -519,6 +524,16 @@ make_agg_subplan(PlannerInfo *root, MinMaxAggInfo *mminfo) plan->targetlist = subparse->targetList; +#ifdef XCP + /* Set plan distribution */ + if (mminfo->path->distribution) + { + plan = (Plan *) make_remotesubplan(subroot, plan, NULL, + mminfo->path->distribution, + mminfo->path->pathkeys); + } +#endif + plan = (Plan *) make_limit(plan, subparse->limitOffset, subparse->limitCount, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5ff4f501c2..b8e8f6fc11 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3,6 +3,11 @@ * planner.c * The query optimizer external interface. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -41,7 +46,7 @@ #ifdef PGXC #include "commands/prepare.h" #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #endif @@ -104,9 +109,18 @@ static void get_column_info_for_window(PlannerInfo *root, WindowClause *wc, int *ordNumCols, AttrNumber **ordColIdx, Oid **ordOperators); +#ifdef XCP +static Plan *grouping_distribution(PlannerInfo *root, Plan *plan, + int numGroupCols, AttrNumber *groupColIdx, + List *current_pathkeys, Distribution **distribution); +static bool equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2); +#endif #ifdef PGXC +#ifndef XCP static void separate_rowmarks(PlannerInfo *root); #endif +#endif /***************************************************************************** * @@ -130,6 +144,7 @@ planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result = (*planner_hook) (parse, cursorOptions, boundParams); else #ifdef PGXC +#ifndef XCP /* * A Coordinator receiving a query from another Coordinator * is not allowed to go into PGXC planner. 
@@ -137,7 +152,8 @@ planner(Query *parse, int cursorOptions, ParamListInfo boundParams) if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) result = pgxc_planner(parse, cursorOptions, boundParams); else -#endif +#endif /* XCP */ +#endif /* PGXC */ result = standard_planner(parse, cursorOptions, boundParams); return result; } @@ -153,6 +169,12 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) ListCell *lp, *lr; +#ifdef XCP + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && parse->utilityStmt && + IsA(parse->utilityStmt, RemoteQuery)) + return pgxc_direct_planner(parse, cursorOptions, boundParams); +#endif + /* Cursor options may come from caller or from DECLARE CURSOR stmt */ if (parse->utilityStmt && IsA(parse->utilityStmt, DeclareCursorStmt)) @@ -211,6 +233,14 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) /* primary planning entry point (may recurse for subqueries) */ top_plan = subquery_planner(glob, parse, NULL, false, tuple_fraction, &root); +#ifdef XCP + if (root->distribution) + { + top_plan = (Plan *) make_remotesubplan(root, top_plan, NULL, + root->distribution, + root->query_pathkeys); + } +#endif /* * If creating a plan for a scrollable cursor, make sure it can run @@ -237,6 +267,35 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) lfirst(lp) = set_plan_references(subroot, subplan); } +#ifdef PGXC +#ifndef XCP + /* + * PGXC should apply INSERT/UPDATE/DELETE to a Datanode. We are overriding + * normal Postgres behavior by modifying final plan or by adding a node on + * top of it. + * If the optimizer finds out that there is nothing to UPDATE/INSERT/DELETE + * in the table/s (say using constraint exclusion), it does not add modify + * table plan on the top. We should send queries to the remote nodes only + * when there is something to modify. 
+ */ + if (IS_PGXC_COORDINATOR && IsA(top_plan, ModifyTable)) + switch (parse->commandType) + { + case CMD_INSERT: + top_plan = create_remoteinsert_plan(root, top_plan); + break; + case CMD_UPDATE: + top_plan = create_remoteupdate_plan(root, top_plan); + break; + case CMD_DELETE: + top_plan = create_remotedelete_plan(root, top_plan); + break; + default: + break; + } +#endif /* XCP */ +#endif + /* build the PlannedStmt result */ result = makeNode(PlannedStmt); @@ -256,6 +315,11 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result->relationOids = glob->relationOids; result->invalItems = glob->invalItems; result->nParamExec = list_length(glob->paramlist); +#ifdef XCP + result->distributionType = LOCATOR_TYPE_NONE; + result->distributionKey = InvalidAttrNumber; + result->distributionNodes = NULL; +#endif return result; } @@ -316,8 +380,10 @@ subquery_planner(PlannerGlobal *glob, Query *parse, root->hasInheritedTarget = false; #ifdef PGXC +#ifndef XCP root->rs_alias_index = 1; -#endif +#endif /* XCP */ +#endif /* PGXC */ root->hasRecursion = hasRecursion; if (hasRecursion) root->wt_param_id = SS_assign_special_param(root); @@ -397,6 +463,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, preprocess_rowmarks(root); #ifdef PGXC +#ifndef XCP /* * In Coordinators we separate row marks in two groups * one comprises of row marks of types ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE @@ -415,6 +482,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, */ separate_rowmarks(root); #endif +#endif /* * Expand any rangetable entries that are inheritance sets into "append * relations". 
This can add entries to the rangetable, but they must be @@ -584,6 +652,13 @@ subquery_planner(PlannerGlobal *glob, Query *parse, else rowMarks = root->rowMarks; +#ifdef XCP + if (root->query_level > 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("INSERT/UPDATE/DELETE is not supported in subquery"))); +#endif + plan = (Plan *) make_modifytable(parse->commandType, parse->canSetTag, list_make1_int(parse->resultRelation), @@ -591,9 +666,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, returningLists, rowMarks, SS_assign_special_param(root)); -#ifdef PGXC - plan = pgxc_make_modifytable(root, plan); -#endif } } @@ -610,6 +682,57 @@ subquery_planner(PlannerGlobal *glob, Query *parse, if (subroot) *subroot = root; + /* + * XCPTODO + * Temporarily block WITH RECURSIVE for most cases + * until we can fix. Allow for pg_catalog tables and replicated tables. + */ + if (root->hasRecursion) + { + int idx; + bool recursiveOk = true; + + /* seems to start at 1... */ + for (idx = 1; idx < root->simple_rel_array_size - 1; idx++) + { + RangeTblEntry *rte; + + rte = root->simple_rte_array[idx]; + + if (!rte || rte->rtekind == RTE_JOIN) + { + continue; + } + else if (rte->rtekind == RTE_RELATION) + { + char loc_type; + + loc_type = GetRelationLocType(rte->relid); + + /* skip pg_catalog */ + if (loc_type == LOCATOR_TYPE_NONE) + continue; + + /* If replicated, allow */ + if (IsLocatorReplicated(loc_type)) + { + continue; + } + else + { + recursiveOk = false; + break; + } + } + else + { + recursiveOk = false; + break; + } + } + if (!recursiveOk) + elog(ERROR, "WITH RECURSIVE currently not supported on distributed tables."); + } return plan; } @@ -761,9 +884,6 @@ inheritance_planner(PlannerInfo *root) List *returningLists = NIL; List *rowMarks; ListCell *lc; -#ifdef PGXC - ModifyTable *mtplan; -#endif /* * We generate a modified instance of the original Query for each target @@ -882,6 +1002,39 @@ inheritance_planner(PlannerInfo *root) if 
(is_dummy_plan(subplan)) continue; +#ifdef XCP + /* + * All subplans should have the same distribution, except may be + * restriction. At the moment this is always the case but if this + * is changed we should handle inheritance differently. + * Effectively we want to push the modify table down to data nodes, if + * it is running against distributed inherited tables. To achieve this + * we are building up distribution of the query from distributions of + * the subplans. + * If subplans are restricted to different nodes we should union these + * restrictions, if at least one subplan is not restricted we should + * not restrict parent plan. + * After returning a plan from the function valid root->distribution + * value will force proper RemoteSubplan node on top of it. + */ + if (root->distribution == NULL) + root->distribution = subroot.distribution; + else if (!bms_is_empty(root->distribution->restrictNodes)) + { + if (bms_is_empty(subroot.distribution->restrictNodes)) + { + bms_free(root->distribution->restrictNodes); + root->distribution->restrictNodes = NULL; + } + else + { + root->distribution->restrictNodes = bms_join( + root->distribution->restrictNodes, + subroot.distribution->restrictNodes); + subroot.distribution->restrictNodes = NULL; + } + } +#endif subplans = lappend(subplans, subplan); /* @@ -964,20 +1117,13 @@ inheritance_planner(PlannerInfo *root) rowMarks = root->rowMarks; /* And last, tack on a ModifyTable node to do the UPDATE/DELETE work */ -#ifdef PGXC - mtplan = make_modifytable(parse->commandType, -#else return (Plan *) make_modifytable(parse->commandType, -#endif parse->canSetTag, resultRelations, subplans, returningLists, rowMarks, SS_assign_special_param(root)); -#ifdef PGXC - return pgxc_make_modifytable(root, (Plan *)mtplan); -#endif } /*-------------------- @@ -1012,6 +1158,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) double dNumGroups = 0; bool use_hashed_distinct = false; bool tested_hashed_distinct = false; +#ifdef 
XCP + Distribution *distribution = NULL; /* distribution of the result_plan */ +#endif /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) @@ -1361,6 +1510,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) result_plan = create_plan(root, best_path); current_pathkeys = best_path->pathkeys; +#ifdef XCP + distribution = best_path->distribution; +#endif /* Detect if we'll need an explicit sort for grouping */ if (parse->groupClause && !use_hashed_grouping && @@ -1402,16 +1554,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) * the desired tlist. */ result_plan->targetlist = sub_tlist; -#ifdef PGXC - /* - * If the Join tree is completely shippable, adjust the - * target list of the query according to the new targetlist - * set above. For now do this only for SELECT statements. - */ - if (IsA(result_plan, RemoteQuery) && parse->commandType == CMD_SELECT) - pgxc_rqplan_adjust_tlist((RemoteQuery *)result_plan); -#endif /* PGXC */ } +#ifdef XCP + /* + * RemoteSubplan is conditionally projection capable - it is + * pushing projection to the data nodes + */ + if (IsA(result_plan, RemoteSubplan)) + result_plan->lefttree->targetlist = sub_tlist; +#endif /* * Also, account for the cost of evaluation of the sub_tlist. @@ -1456,6 +1607,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ if (use_hashed_grouping) { +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, + numGroupCols, groupColIdx, + current_pathkeys, + &distribution); +#endif /* Hashed aggregate plan --- no sort needed */ result_plan = (Plan *) make_agg(root, tlist, @@ -1467,6 +1624,18 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) extract_grouping_ops(parse->groupClause), numGroups, result_plan); +#ifdef PGXC +#ifndef XCP + /* + * Grouping will certainly not increase the number of rows + * coordinator fetches from datanode, in fact it's expected to + * reduce the number drastically. 
Hence, try pushing GROUP BY + * clauses and aggregates to the datanode, thus saving bandwidth. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + result_plan = create_remoteagg_plan(root, result_plan); +#endif /* XCP */ +#endif /* PGXC */ /* Hashed aggregation produces randomly-ordered results */ current_pathkeys = NIL; } @@ -1500,6 +1669,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) current_pathkeys = NIL; } +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, + numGroupCols, groupColIdx, + current_pathkeys, + &distribution); +#endif result_plan = (Plan *) make_agg(root, tlist, (List *) parse->havingQual, @@ -1530,6 +1705,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) current_pathkeys = root->group_pathkeys; } +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, + numGroupCols, groupColIdx, + current_pathkeys, + &distribution); +#endif result_plan = (Plan *) make_group(root, tlist, (List *) parse->havingQual, @@ -1553,12 +1734,18 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) * this routine to avoid having to generate the plan in the * first place. 
*/ +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, 0, NULL, + current_pathkeys, + &distribution); +#endif result_plan = (Plan *) make_result(root, tlist, parse->havingQual, NULL); } #ifdef PGXC +#ifndef XCP /* * Grouping will certainly not increase the number of rows * Coordinator fetches from Datanode, in fact it's expected to @@ -1567,6 +1754,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) result_plan = create_remotegrouping_plan(root, result_plan); +#endif /* XCP */ #endif /* PGXC */ } /* end of non-minmax-aggregate case */ @@ -1626,6 +1814,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) window_tlist = add_volatile_sort_exprs(window_tlist, tlist, activeWindows); result_plan->targetlist = (List *) copyObject(window_tlist); +#ifdef XCP + /* + * We can not guarantee correct result of windowing function + * if aggregation is pushed down to Datanodes. So if current plan + * produces a distributed result set we should bring it to + * coordinator. + */ + if (distribution) + { + result_plan = (Plan *) + make_remotesubplan(root, result_plan, NULL, + distribution, current_pathkeys); + distribution = NULL; + } +#endif foreach(l, activeWindows) { @@ -1667,6 +1870,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) result_plan = (Plan *) sort_plan; current_pathkeys = window_pathkeys; } +#ifdef XCP + /* + * In our code, Sort may be pushed down to the Datanodes, + * and therefore we may get the sort_plan is not really a + * Sort node. 
In this case we should get sort columns from + * the top RemoteSubplan + */ + if (!IsA(sort_plan, Sort)) + { + RemoteSubplan *pushdown; + pushdown = find_push_down_plan(sort_plan, true); + Assert(pushdown && pushdown->sort); + get_column_info_for_window(root, wc, tlist, + pushdown->sort->numCols, + pushdown->sort->sortColIdx, + &partNumCols, + &partColIdx, + &partOperators, + &ordNumCols, + &ordColIdx, + &ordOperators); + } + else +#endif /* In either case, extract the per-column information */ get_column_info_for_window(root, wc, tlist, sort_plan->numCols, @@ -1766,6 +1993,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) if (use_hashed_distinct) { +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, + list_length(parse->distinctClause), + extract_grouping_cols(parse->distinctClause, + result_plan->targetlist), + current_pathkeys, + &distribution); +#endif /* Hashed aggregate plan --- no sort needed */ result_plan = (Plan *) make_agg(root, result_plan->targetlist, @@ -1822,6 +2057,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) -1.0); } +#ifdef XCP + result_plan = grouping_distribution(root, result_plan, + list_length(parse->distinctClause), + extract_grouping_cols(parse->distinctClause, + result_plan->targetlist), + current_pathkeys, + &distribution); +#endif result_plan = (Plan *) make_unique(result_plan, parse->distinctClause); result_plan->plan_rows = dNumDistinctRows; @@ -1841,11 +2084,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) result_plan, root->sort_pathkeys, limit_tuples); -#ifdef PGXC - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - result_plan = (Plan *) create_remotesort_plan(root, - result_plan); -#endif /* PGXC */ current_pathkeys = root->sort_pathkeys; } } @@ -1874,16 +2112,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ if (parse->limitCount || parse->limitOffset) { +#ifdef XCP + /* We should put Limit on top of distributed results */ + if (distribution) + { + 
result_plan = (Plan *) + make_remotesubplan(root, result_plan, NULL, + distribution, current_pathkeys); + distribution = NULL; + } +#endif result_plan = (Plan *) make_limit(result_plan, parse->limitOffset, parse->limitCount, offset_est, count_est); -#ifdef PGXC - /* See if we can push LIMIT or OFFSET clauses to Datanodes */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - result_plan = (Plan *) create_remotelimit_plan(root, result_plan); -#endif /* PGXC */ } /* @@ -1892,6 +2135,155 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ root->query_pathkeys = current_pathkeys; +#ifdef XCP + /* + * Adjust query distribution if requested + */ + if (root->distribution) + { + if (equal_distributions(root, root->distribution, distribution)) + { + if (IsLocatorReplicated(distribution->distributionType) && + contain_volatile_functions((Node *) result_plan->targetlist)) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("can not update replicated table with result of volatile function"))); + /* + * Source tuple will be consumed on the same node where it is + * produced, so if it is known that some node does not yield tuples + * we do not want to send subquery for execution on these nodes + * at all. + * So copy the restriction to the external distribution. + * XXX Is that ever possible if external restriction is already + * defined? If yes we probably should use intersection of the sets, + * and if resulting set is empty create dummy plan and set it as + * the result_plan. Need to think this over + */ + root->distribution->restrictNodes = + bms_copy(distribution->restrictNodes); + } + else + { + RemoteSubplan *distributePlan; + /* + * If the planned statement is either UPDATE or DELETE different + * distributions here mean the ModifyTable node will be placed on + * top of RemoteSubquery. 
UPDATE and DELETE versions of ModifyTable + * use TID of incoming tuple to apply the changes, but the + * RemoteSubquery node supplies RemoteTuples, without such field. + * Therefore we can not execute such plan. + * Most common case is when UPDATE statement modifies the + * distribution column. Also incorrect distributed plan is possible + * if planning a complex UPDATE or DELETE statement involving table + * join. + * We output different error messages in UPDATE and DELETE cases + * mostly for compatibility with PostgresXC. It is hard to determine + * here, if such plan is because updated partitioning key or poorly + * planned join, so in case of UPDATE we assume the first case as + * more probable, for DELETE the second case is only possible. + * The error message may be misleading, if that is UPDATE and join, + * but hope we will target distributed update problem soon. + * There are two ways of fixing that: + * 1. Improve distribution planner to never consider to redistribute + * target table. So if planner finds that it has no choice, it would + * throw error somewhere else. So here we only be catching cases of + * updating distribution columns. + * 2. Modify executor and allow distribution column updates. However + * there are a lot of issues behind the scene when implementing that + * approach. + */ + if (parse->commandType == CMD_UPDATE) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed update"), + errdetail("correlated UPDATE or updating distribution column currently not supported in Postgres-XL."))); + if (parse->commandType == CMD_DELETE) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed delete"), + errdetail("correlated or complex DELETE is currently not supported in Postgres-XL."))); + + /* + * Redistribute result according to requested distribution. 
+ */ + if ((distributePlan = find_push_down_plan(result_plan, true))) + { + Bitmapset *tmpset; + int nodenum; + + distributePlan->distributionType = root->distribution->distributionType; + distributePlan->distributionKey = InvalidAttrNumber; + if (root->distribution->distributionExpr) + { + ListCell *lc; + + /* Find distribution expression in the target list */ + foreach(lc, distributePlan->scan.plan.targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (equal(tle->expr, root->distribution->distributionExpr)) + { + distributePlan->distributionKey = tle->resno; + break; + } + } + + if (distributePlan->distributionKey == InvalidAttrNumber) + { + Plan *lefttree = distributePlan->scan.plan.lefttree; + Plan *plan; + TargetEntry *newtle; + + /* The expression is not found, need to add junk */ + newtle = makeTargetEntry((Expr *) root->distribution->distributionExpr, + list_length(lefttree->targetlist) + 1, + NULL, + true); + + if (is_projection_capable_plan(lefttree)) + { + /* Ok to modify subplan's target list */ + lefttree->targetlist = lappend(lefttree->targetlist, + newtle); + } + else + { + /* Use Result node to calculate expression */ + List *newtlist = list_copy(lefttree->targetlist); + newtlist = lappend(newtlist, newtle); + lefttree = (Plan *) make_result(root, newtlist, NULL, lefttree); + distributePlan->scan.plan.lefttree = lefttree; + } + /* Update all the hierarchy */ + for (plan = result_plan; plan != lefttree; plan = plan->lefttree) + plan->targetlist = lefttree->targetlist; + } + } + tmpset = bms_copy(root->distribution->nodes); + distributePlan->distributionNodes = NIL; + while ((nodenum = bms_first_member(tmpset)) >= 0) + distributePlan->distributionNodes = lappend_int( + distributePlan->distributionNodes, nodenum); + bms_free(tmpset); + } + else + result_plan = (Plan *) make_remotesubplan(root, + result_plan, + root->distribution, + distribution, + NULL); + } + } + else + { + /* + * Inform caller about distribution of the subplan + */ 
+ root->distribution = distribution; + } +#endif + return result_plan; } @@ -2086,6 +2478,7 @@ preprocess_rowmarks(PlannerInfo *root) } #ifdef PGXC +#ifndef XCP /* * separate_rowmarks - In XC Coordinators are supposed to skip handling * of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE. @@ -2120,7 +2513,7 @@ separate_rowmarks(PlannerInfo *root) root->rowMarks = rml_2; root->xc_rowMarks = rml_1; } - +#endif /*XCP*/ #endif /*PGXC*/ /* @@ -3400,3 +3793,85 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); } + + +#ifdef XCP +/* + * Grouping preserves distribution if distribution key is the + * first grouping key or if distribution is replicated. + * In these cases aggregation is fully pushed down to nodes. + * Otherwise we need 2-phase aggregation so put remote subplan + * on top of the result_plan. When adding result agg on top of + * RemoteSubplan first aggregation phase will be pushed down + * automatically. + */ +static Plan * +grouping_distribution(PlannerInfo *root, Plan *plan, + int numGroupCols, AttrNumber *groupColIdx, + List *current_pathkeys, Distribution **distribution) +{ + if (*distribution && + !IsLocatorReplicated((*distribution)->distributionType) && + (numGroupCols == 0 || + (*distribution)->distributionExpr == NULL || + !equal(((TargetEntry *)list_nth(plan->targetlist, groupColIdx[0]-1))->expr, + (*distribution)->distributionExpr))) + { + Plan *result_plan; + result_plan = (Plan *) make_remotesubplan(root, plan, NULL, + *distribution, + current_pathkeys); + *distribution = NULL; + return result_plan; + } + return plan; +} + + +/* + * Check if two distributions are equal. 
+ * Distributions are considered equal if they are of the same type, on the same + * nodes and if they have distribution expressions defined they are equal + * (either the same expressions or they are member of the same equivalence + * class) + */ +static bool +equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2) +{ + /* fast path */ + if (dst1 == dst2) + return true; + if (dst1 == NULL || dst2 == NULL) + return false; + + /* Conditions that easier to check go first */ + if (dst1->distributionType != dst2->distributionType) + return false; + + if (!bms_equal(dst1->nodes, dst2->nodes)) + return false; + + if (equal(dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* + * For more thorough expression check we need to ensure they both are + * defined + */ + if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL) + return false; + + /* + * More thorough check, but allows some important cases, like if + * distribution column is not updated (implicit set distcol=distcol) or + * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many + * applications + */ + if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* The restrictNodes field does not matter for distribution equality */ + return false; +} +#endif diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 5187d27ae9..691b6d0909 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -4,6 +4,11 @@ * Post-processing of a completed plan tree: fix references to subplan * vars, compute regproc values for operators, etc * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,8 +31,7 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" #ifdef PGXC -#include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #endif @@ -69,6 +73,9 @@ typedef struct indexed_tlist *subplan_itlist; Index newvarno; int rtoffset; +#ifdef XCP + bool agg_master; +#endif } fix_upper_expr_context; typedef struct @@ -77,7 +84,6 @@ typedef struct indexed_tlist *base_itlist; int rtoffset; Index relid; - bool return_non_base_vars; /* Should we reject or return vars not found in base_itlist */ } fix_remote_expr_context; /* @@ -127,11 +133,20 @@ static List *fix_join_expr(PlannerInfo *root, Index acceptable_rel, int rtoffset); static Node *fix_join_expr_mutator(Node *node, fix_join_expr_context *context); +#ifdef XCP +static Node *fix_upper_expr(PlannerInfo *root, + Node *node, + indexed_tlist *subplan_itlist, + Index newvarno, + int rtoffset, + bool agg_master); +#else static Node *fix_upper_expr(PlannerInfo *root, Node *node, indexed_tlist *subplan_itlist, Index newvarno, int rtoffset); +#endif static Node *fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context); static List *set_returning_clause_references(PlannerInfo *root, @@ -144,18 +159,20 @@ static bool extract_query_dependencies_walker(Node *node, PlannerInfo *context); #ifdef PGXC +#ifndef XCP /* References for remote plans */ static List * fix_remote_expr(PlannerInfo *root, List *clauses, indexed_tlist *base_itlist, Index newrelid, - int rtoffset, - bool return_non_base_vars); + int rtoffset); static Node *fix_remote_expr_mutator(Node *node, fix_remote_expr_context *context); static void set_remote_references(PlannerInfo *root, RemoteQuery *rscan, int rtoffset); -static void pgxc_set_agg_references(PlannerInfo *root, Agg *aggplan); -static List *set_remote_returning_refs(PlannerInfo *root, List *rlist, Plan *topplan, 
Index relid, int rtoffset); +#endif +#endif +#ifdef XCP +static void set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset); #endif @@ -454,6 +471,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) } break; #ifdef PGXC +#ifndef XCP case T_RemoteQuery: { RemoteQuery *splan = (RemoteQuery *) plan; @@ -474,6 +492,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) } break; #endif +#endif case T_ForeignScan: { ForeignScan *splan = (ForeignScan *) plan; @@ -487,6 +506,11 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) fix_scan_list(root, splan->fdw_exprs, rtoffset); } break; +#ifdef XCP + case T_RemoteSubplan: + set_remotesubplan_references(root, plan, rtoffset); + break; +#endif /* XCP */ case T_NestLoop: case T_MergeJoin: case T_HashJoin: @@ -555,11 +579,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) } break; case T_Agg: -#ifdef PGXC - /* If the lower plan is RemoteQuery plan, adjust the aggregates */ - pgxc_set_agg_references(root, (Agg *)plan); - /* Fall through */ -#endif /* PGXC */ case T_Group: set_upper_references(root, plan, rtoffset); break; @@ -605,12 +624,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) case T_ModifyTable: { ModifyTable *splan = (ModifyTable *) plan; -#ifdef PGXC - int n = 0; - List *firstRetList; /* First returning list required for - * setting up visible plan target list - */ -#endif Assert(splan->plan.targetlist == NIL); Assert(splan->plan.qual == NIL); @@ -635,48 +648,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) List *rlist = (List *) lfirst(lcrl); Index resultrel = lfirst_int(lcrr); Plan *subplan = (Plan *) lfirst(lcp); -#ifdef PGXC - RemoteQuery *rq = NULL; - - if (n == 0) - { - /* - * Set up first returning list before we change - * var references to point to RTE_REMOTE_DUMMY - */ - firstRetList = set_returning_clause_references(root, - rlist, - subplan, - resultrel, - rtoffset); - /* Restore the returning list changed by the 
above call */ - rlist = (List *) lfirst(lcrl); - } - - if (splan->remote_plans) - rq = (RemoteQuery *)list_nth(splan->remote_plans, n); - n++; - - if(rq != NULL && IS_PGXC_COORDINATOR && !IsConnFromCoord()) - { - /* - * Set references of returning clause by adjusting - * varno/varattno according to target list in - * remote query node - */ - rlist = set_remote_returning_refs(root, - rlist, - (Plan *)rq, - rq->scan.scanrelid, - rtoffset); - /* - * The next call to set_returning_clause_references - * should skip the vars already taken care of by - * the above call to set_remote_returning_refs - */ - resultrel = rq->scan.scanrelid; - } -#endif + rlist = set_returning_clause_references(root, rlist, subplan, @@ -686,16 +658,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) } splan->returningLists = newRL; -#ifdef PGXC - /* - * In XC we do not need to set the target list as the - * first RETURNING list from the finalized list because - * it can contain vars referring to RTE_REMOTE_DUMMY. - * We therefore create a list before fixing - * remote returning references and use that here. - */ - splan->plan.targetlist = copyObject(firstRetList); -#else /* * Set up the visible plan targetlist as being the same as * the first RETURNING list. This is for the use of @@ -705,7 +667,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) * twice on identical targetlists. 
*/ splan->plan.targetlist = copyObject(linitial(newRL)); -#endif } foreach(l, splan->resultRelations) @@ -736,34 +697,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) root->glob->resultRelations = list_concat(root->glob->resultRelations, list_copy(splan->resultRelations)); - -#ifdef PGXC - /* Adjust references of remote query nodes in ModifyTable node */ - if(IS_PGXC_COORDINATOR && !IsConnFromCoord()) - { - ListCell *elt; - RemoteQuery *rq; - - foreach(elt, splan->remote_plans) - { - rq = (RemoteQuery *) lfirst(elt); - /* - * If base_tlist is set, it means that we have a reduced remote - * query plan. So need to set the var references accordingly. - */ - if (rq->base_tlist) - set_remote_references(root, rq, rtoffset); - rq->scan.plan.targetlist = fix_scan_list(root, - rq->scan.plan.targetlist, - rtoffset); - rq->scan.plan.qual = fix_scan_list(root, - rq->scan.plan.qual, - rtoffset); - rq->base_tlist = fix_scan_list(root, rq->base_tlist, rtoffset); - rq->scan.scanrelid += rtoffset; - } - } -#endif } break; case T_Append: @@ -876,6 +809,22 @@ set_indexonlyscan_references(PlannerInfo *root, index_itlist = build_tlist_index(plan->indextlist); plan->scan.scanrelid += rtoffset; +#ifdef XCP + plan->scan.plan.targetlist = (List *) + fix_upper_expr(root, + (Node *) plan->scan.plan.targetlist, + index_itlist, + INDEX_VAR, + rtoffset, + false); + plan->scan.plan.qual = (List *) + fix_upper_expr(root, + (Node *) plan->scan.plan.qual, + index_itlist, + INDEX_VAR, + rtoffset, + false); +#else plan->scan.plan.targetlist = (List *) fix_upper_expr(root, (Node *) plan->scan.plan.targetlist, @@ -888,6 +837,7 @@ set_indexonlyscan_references(PlannerInfo *root, index_itlist, INDEX_VAR, rtoffset); +#endif /* indexqual is already transformed to reference index columns */ plan->indexqual = fix_scan_list(root, plan->indexqual, rtoffset); /* indexorderby is already transformed to reference index columns */ @@ -1260,11 +1210,20 @@ set_join_references(PlannerInfo *root, Join 
*join, int rtoffset) { NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); +#ifdef XCP + nlp->paramval = (Var *) fix_upper_expr(root, + (Node *) nlp->paramval, + outer_itlist, + OUTER_VAR, + rtoffset, + false); +#else nlp->paramval = (Var *) fix_upper_expr(root, (Node *) nlp->paramval, outer_itlist, OUTER_VAR, rtoffset); +#endif /* Check we replaced any PlaceHolderVar with simple Var */ if (!(IsA(nlp->paramval, Var) && nlp->paramval->varno == OUTER_VAR)) @@ -1323,6 +1282,12 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset) indexed_tlist *subplan_itlist; List *output_targetlist; ListCell *l; +#ifdef XCP + bool agg_master; + + agg_master = (IsA(plan, Agg) && + ((Agg *) plan)->aggdistribution == AGG_MASTER); +#endif subplan_itlist = build_tlist_index(subplan->targetlist); @@ -1341,18 +1306,36 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset) subplan_itlist, OUTER_VAR); if (!newexpr) +#ifdef XCP + newexpr = fix_upper_expr(root, + (Node *) tle->expr, + subplan_itlist, + OUTER_VAR, + rtoffset, + agg_master); +#else newexpr = fix_upper_expr(root, (Node *) tle->expr, subplan_itlist, OUTER_VAR, rtoffset); +#endif } else +#ifdef XCP + newexpr = fix_upper_expr(root, + (Node *) tle->expr, + subplan_itlist, + OUTER_VAR, + rtoffset, + agg_master); +#else newexpr = fix_upper_expr(root, (Node *) tle->expr, subplan_itlist, OUTER_VAR, rtoffset); +#endif tle = flatCopyTargetEntry(tle); tle->expr = (Expr *) newexpr; output_targetlist = lappend(output_targetlist, tle); @@ -1360,12 +1343,20 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset) plan->targetlist = output_targetlist; plan->qual = (List *) +#ifdef XCP + fix_upper_expr(root, + (Node *) plan->qual, + subplan_itlist, + OUTER_VAR, + rtoffset, + agg_master); +#else fix_upper_expr(root, (Node *) plan->qual, subplan_itlist, OUTER_VAR, rtoffset); - +#endif pfree(subplan_itlist); } @@ -1594,6 +1585,34 @@ search_indexed_tlist_for_non_var(Node *node, return NULL; /* no match */ } 
+#ifdef PGXC +#ifndef XCP +/* + * search_tlist_for_var --- find a Var in the provided tlist. This does a + * basic scan through the list. So not very efficient... + * + * If no match, return NULL. + * + */ +Var * +search_tlist_for_var(Var *var, List *jtlist) +{ + Index varno = var->varno; + AttrNumber varattno = var->varattno; + ListCell *l; + + foreach(l, jtlist) + { + Var *listvar = (Var *) lfirst(l); + + if (listvar->varno == varno && listvar->varattno == varattno) + return var; + } + return NULL; /* no match */ +} +#endif +#endif + /* * search_indexed_tlist_for_sortgroupref --- find a sort/group expression * (which is assumed not to be just a Var) @@ -1798,12 +1817,22 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context) * varno = newvarno, varattno = resno of corresponding targetlist element. * The original tree is not modified. */ +#ifdef XCP +static Node * +fix_upper_expr(PlannerInfo *root, + Node *node, + indexed_tlist *subplan_itlist, + Index newvarno, + int rtoffset, + bool agg_master) +#else static Node * fix_upper_expr(PlannerInfo *root, Node *node, indexed_tlist *subplan_itlist, Index newvarno, int rtoffset) +#endif { fix_upper_expr_context context; @@ -1811,6 +1840,9 @@ fix_upper_expr(PlannerInfo *root, context.subplan_itlist = subplan_itlist; context.newvarno = newvarno; context.rtoffset = rtoffset; +#ifdef XCP + context.agg_master = agg_master; +#endif return fix_upper_expr_mutator(node, &context); } @@ -1855,6 +1887,16 @@ fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context) newvar = search_indexed_tlist_for_non_var(node, context->subplan_itlist, context->newvarno); +#ifdef XCP + if (newvar && context->agg_master && IsA(node, Aggref)) + { + TargetEntry *newtle; + Aggref *newnode = copyObject(node); + newtle = makeTargetEntry((Expr *) newvar, 1, NULL, false); + newnode->args = list_make1(newtle); + return (Node *) newnode; + } +#endif if (newvar) return (Node *) newvar; } @@ -2104,6 +2146,10 @@ 
extract_query_dependencies_walker(Node *node, PlannerInfo *context) return expression_tree_walker(node, extract_query_dependencies_walker, (void *) context); } + + +#ifdef PGXC +#ifndef XCP /* * fix_remote_expr * Create a new set of targetlist entries or qual clauses by @@ -2115,9 +2161,6 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context) * 'clauses' is the targetlist or list of clauses * 'base_itlist' is the indexed target list of the base referenced relations * - * 'return_non_base_vars' lets the caller decide whether to reject - * or return vars not found in base_itlist - * * Returns the new expression tree. The original clause structure is * not modified. */ @@ -2126,8 +2169,7 @@ fix_remote_expr(PlannerInfo *root, List *clauses, indexed_tlist *base_itlist, Index newrelid, - int rtoffset, - bool return_non_base_vars) + int rtoffset) { fix_remote_expr_context context; @@ -2135,7 +2177,6 @@ fix_remote_expr(PlannerInfo *root, context.base_itlist = base_itlist; context.relid = newrelid; context.rtoffset = rtoffset; - context.return_non_base_vars = return_non_base_vars; return (List *) fix_remote_expr_mutator((Node *) clauses, &context); } @@ -2160,10 +2201,6 @@ fix_remote_expr_mutator(Node *node, fix_remote_expr_context *context) if (newvar) return (Node *) newvar; - /* If it's not found in base_itlist, return it if required */ - if (context->return_non_base_vars && var->varno != context->relid) - return (Node *) var; - /* No reference found for Var */ elog(ERROR, "variable not found in base remote scan target lists"); } @@ -2204,172 +2241,93 @@ set_remote_references(PlannerInfo *root, RemoteQuery *rscan, int rtoffset) rscan->scan.plan.targetlist, base_itlist, rscan->scan.scanrelid, - rtoffset, - false); + rtoffset); rscan->scan.plan.qual = fix_remote_expr(root , rscan->scan.plan.qual, base_itlist, rscan->scan.scanrelid, - rtoffset, - false); + rtoffset); pfree(base_itlist); } -/* - * set_remote_returning_refs - * - * Fix references of remote 
returning list to point - * to reference target list values from the base - * relation target lists - */ - -static List * -set_remote_returning_refs(PlannerInfo *root, - List *rlist, - Plan *topplan, - Index relid, - int rtoffset) +Node * +pgxc_fix_scan_expr(PlannerInfo *root, Node *node, int rtoffset) { - indexed_tlist *base_itlist; - - base_itlist = build_tlist_index(topplan->targetlist); - - rlist = fix_remote_expr(root, - rlist, - base_itlist, - relid, - rtoffset, - true); - - pfree(base_itlist); - - return rlist; + return fix_scan_expr(root, node, rtoffset); } +#endif /* XCP */ +#endif /* PGXC */ -#ifdef PGXC + +#ifdef XCP /* - * For Agg plans, if the lower scan plan is a RemoteQuery node, adjust the - * Aggref nodes to pull the transition results from the datanodes. We do while - * setting planner references so that the upper nodes will find the nodes that - * they expect in Agg plans. + * set_remotesubplan_references + * Usually RemoteSubplan node does just translates its target list, so it is + * enought to invoke fix_scan_list here. One exception is if the + * RemoteSubplan is set on top of ModifyTable. In this case target lists of both + * these plan nodes are NIL. If the subplan is not returning we want to leave + * target list NIL, if yes, we should make up target list as a list of simple + * references to entries from the first returning list. + * The qual of RemoteSubplan is always NULL. 
*/ -void -pgxc_set_agg_references(PlannerInfo *root, Agg *aggplan) +static void +set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset) { - RemoteQuery *rqplan = (RemoteQuery *)aggplan->plan.lefttree; - Sort *srtplan; - List *aggs_n_vars; - ListCell *lcell; - List *nodes_to_modify; - List *rq_nodes_to_modify; - List *srt_nodes_to_modify; - - /* Lower plan tree can be Sort->RemoteQuery or RemoteQuery */ - if (IsA(rqplan, Sort)) + if (plan->targetlist == NIL) { - srtplan = (Sort *)rqplan; - rqplan = (RemoteQuery *)srtplan->plan.lefttree; - } - else - srtplan = NULL; - - if (!IsA(rqplan, RemoteQuery)) - return; - - Assert(IS_PGXC_COORDINATOR && !IsConnFromCoord()); - /* - * If there are not transition results expected from lower plans, nothing to - * be done here. - */ - if (!aggplan->skip_trans) - return; - - /* Gather all the aggregates from all the targetlists that need fixing */ - nodes_to_modify = list_copy(aggplan->plan.targetlist); - nodes_to_modify = list_concat(nodes_to_modify, aggplan->plan.qual); - aggs_n_vars = pull_var_clause((Node *)nodes_to_modify, PVC_INCLUDE_AGGREGATES, - PVC_RECURSE_PLACEHOLDERS); - rq_nodes_to_modify = NIL; - srt_nodes_to_modify = NIL; - /* - * For every aggregate, find corresponding aggregate in the lower plan and - * modify it correctly. 
- */ - foreach (lcell, aggs_n_vars) - { - Aggref *aggref = lfirst(lcell); - TargetEntry *tle; - Aggref *rq_aggref; - Aggref *srt_aggref; - Aggref *arg_aggref; /* Aggref to be set as Argument to the - * aggref in the Agg plan */ - - /* Only Aggref expressions need modifications */ - if (!IsA(aggref, Aggref)) + ModifyTable *mt = (ModifyTable *) plan->lefttree; + if (IsA(mt, ModifyTable) && mt->returningLists) { - Assert(IsA(aggref, Var)); - continue; - } + List *returningList; + List *output_targetlist; + ListCell *l; - tle = tlist_member((Node *)aggref, rqplan->scan.plan.targetlist); - if (!tle) - elog(ERROR, "Could not find the Aggref node"); - rq_aggref = (Aggref *)tle->expr; - Assert(equal(rq_aggref, aggref)); - /* - * Remember the Aggref nodes of which we need to modify. This is done so - * that, if there multiple copies of same aggregate, we will match all - * of them - */ - rq_nodes_to_modify = list_append_unique(rq_nodes_to_modify, rq_aggref); - arg_aggref = rq_aggref; + returningList = (List *) linitial(mt->returningLists); + output_targetlist = NIL; + foreach(l, returningList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + Var *newvar; + + newvar = makeVar(OUTER_VAR, + tle->resno, + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + exprCollation((Node *) tle->expr), + 0); + if (IsA(tle->expr, Var)) + { + newvar->varnoold = ((Var *) tle->expr)->varno + rtoffset; + newvar->varoattno = ((Var *) tle->expr)->varattno; + } + else + { + newvar->varnoold = 0; /* wasn't ever a plain Var */ + newvar->varoattno = 0; + } - /* - * If there is a Sort plan, get corresponding expression from there as - * well and remember it to be modified. 
- */ - if (srtplan) - { - tle = tlist_member((Node *)rq_aggref, srtplan->plan.targetlist); - if (!tle) - elog(ERROR, "Could not find the Aggref node"); - srt_aggref = (Aggref *)tle->expr; - Assert(equal(srt_aggref, rq_aggref)); - srt_nodes_to_modify = list_append_unique(srt_nodes_to_modify, - srt_aggref); - arg_aggref = srt_aggref; + tle = flatCopyTargetEntry(tle); + tle->expr = (Expr *) newvar; + output_targetlist = lappend(output_targetlist, tle); + } + plan->targetlist = output_targetlist; } - - /* - * The transition result from the datanodes acts as an input to the - * Aggref node on coordinator. - */ - aggref->args = list_make1(makeTargetEntry((Expr *)arg_aggref, 1, NULL, - false)); - } - - /* Modify the transition types now */ - foreach (lcell, rq_nodes_to_modify) - { - Aggref *rq_aggref = lfirst(lcell); - Assert(IsA(rq_aggref, Aggref)); - rq_aggref->aggtype = rq_aggref->aggtrantype; } - foreach (lcell, srt_nodes_to_modify) + else { - Aggref *srt_aggref = lfirst(lcell); - Assert(IsA(srt_aggref, Aggref)); - srt_aggref->aggtype = srt_aggref->aggtrantype; + /* + * The RemoteSubplan may look like a subject for a dummy tlist. + * It works in most cases. However it may be a subplan of a ModifyTable + * running against a relation with dropped columns. Sanity check assumes + * that subplan will return a NULL constant as a value for the dropped + * column, however set_dummy_tlist_references would replace it with a + * Var. We cannot detemine the parent plan here, so just process it as + * a scan. Executor will ignore this anyway. + */ + plan->targetlist = fix_scan_list(root, plan->targetlist, rtoffset); } - - /* - * We have modified the targetlist of the RemoteQuery plan below the Agg - * plan. Adjust its targetlist as well. 
- */ - pgxc_rqplan_adjust_tlist(rqplan); - - return; + Assert(plan->qual == NULL); } -#endif /* PGXC */ +#endif diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 2328c4b66e..ea5363ab07 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -3,6 +3,11 @@ * subselect.c * Planning routines for subselects and parameters. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -496,6 +501,22 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, SubLinkType subLinkType, root, false, tuple_fraction, &subroot); +#ifdef XCP + if (subroot->distribution) + { + plan = (Plan *) make_remotesubplan(subroot, + plan, + NULL, + subroot->distribution, + subroot->query_pathkeys); + /* + * SS_finalize_plan has already been run on the subplan, + * so we have to copy parameter info to wrapper plan node. + */ + plan->extParam = bms_copy(plan->lefttree->extParam); + plan->allParam = bms_copy(plan->lefttree->allParam); + } +#endif /* And convert to SubPlan or InitPlan format. */ result = build_subplan(root, plan, subroot, @@ -1079,6 +1100,22 @@ SS_process_ctes(PlannerInfo *root) root, cte->cterecursive, 0.0, &subroot); +#ifdef XCP + if (subroot->distribution) + { + plan = (Plan *) make_remotesubplan(subroot, + plan, + NULL, + subroot->distribution, + subroot->query_pathkeys); + /* + * SS_finalize_plan has already been run on the subplan, + * so we have to copy parameter info to wrapper plan node. 
+ */ + plan->extParam = bms_copy(plan->lefttree->extParam); + plan->allParam = bms_copy(plan->lefttree->allParam); + } +#endif /* * Make a SubPlan node for it. This is just enough unlike @@ -2242,6 +2279,11 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, break; #endif +#ifdef XCP + case T_RemoteSubplan: + break; +#endif + case T_Append: { ListCell *l; diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 1af4e7fe93..e355f349f2 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -13,6 +13,11 @@ * between here and there is a bit arbitrary and historical. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -28,6 +33,10 @@ #include "access/sysattr.h" #include "catalog/pg_type.h" #include "nodes/makefuncs.h" +#ifdef XCP +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#endif #include "optimizer/prep.h" #include "optimizer/tlist.h" #include "parser/parsetree.h" @@ -75,6 +84,123 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) tlist = expand_targetlist(tlist, command_type, result_relation, range_table); +#ifdef XCP + /* + * If target relation is specified set distribution of the plan + */ + if (result_relation) + { + Relation rel = heap_open(getrelid(result_relation, range_table), + NoLock); + RelationLocInfo *rel_loc_info = rel->rd_locator_info; + + /* Is target table distributed ? 
*/ + if (rel_loc_info) + { + Distribution *distribution = makeNode(Distribution); + ListCell *lc; + + distribution->distributionType = rel_loc_info->locatorType; + foreach(lc, rel_loc_info->nodeList) + distribution->nodes = bms_add_member(distribution->nodes, + lfirst_int(lc)); + distribution->restrictNodes = NULL; + if (rel_loc_info->partAttrNum) + { + /* + * For INSERT and UPDATE plan tlist is matching the target table + * layout + */ + if (command_type == CMD_INSERT || command_type == CMD_UPDATE) + { + TargetEntry *keyTle; + keyTle = (TargetEntry *) list_nth(tlist, + rel_loc_info->partAttrNum - 1); + + distribution->distributionExpr = (Node *) keyTle->expr; + + /* + * We can restrict the distribution if the expression + * is evaluated to a constant + */ + if (command_type == CMD_INSERT) + { + Oid keytype; + Const *constExpr = NULL; + + keytype = exprType(distribution->distributionExpr); + constExpr = (Const *) eval_const_expressions(root, + distribution->distributionExpr); + if (IsA(constExpr, Const) && + constExpr->consttype == keytype) + { + List *nodeList = NIL; + Bitmapset *tmpset = bms_copy(distribution->nodes); + Bitmapset *restrictinfo = NULL; + Locator *locator; + int *nodenums; + int i, count; + + while((i = bms_first_member(tmpset)) >= 0) + nodeList = lappend_int(nodeList, i); + bms_free(tmpset); + + locator = createLocator(distribution->distributionType, + RELATION_ACCESS_INSERT, + keytype, + LOCATOR_LIST_LIST, + 0, + (void *) nodeList, + (void **) &nodenums, + false); + count = GET_NODES(locator, constExpr->constvalue, + constExpr->constisnull, NULL); + + for (i = 0; i < count; i++) + restrictinfo = bms_add_member(restrictinfo, nodenums[i]); + distribution->restrictNodes = restrictinfo; + list_free(nodeList); + freeLocator(locator); + } + } + } + + /* + * For delete we need to add the partitioning key of the target + * table to the tlist, so distribution can be correctly handled + * trough all the planning process. 
+ */ + if (command_type == CMD_DELETE) + { + Form_pg_attribute att_tup; + TargetEntry *tle; + Var *var; + + att_tup = rel->rd_att->attrs[rel_loc_info->partAttrNum - 1]; + var = makeVar(result_relation, rel_loc_info->partAttrNum, + att_tup->atttypid, att_tup->atttypmod, + att_tup->attcollation, 0); + + tle = makeTargetEntry((Expr *) var, + list_length(tlist) + 1, + pstrdup(NameStr(att_tup->attname)), + true); + tlist = lappend(tlist, tle); + distribution->distributionExpr = (Node *) var; + } + } + else + distribution->distributionExpr = NULL; + + root->distribution = distribution; + } + else + root->distribution = NULL; + + heap_close(rel, NoLock); + } +#endif + /* * Add necessary junk columns for rowmarked rels. These values are needed * for locking of rels selected FOR UPDATE/SHARE, and to do EvalPlanQual diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 6475633ae7..a77b86d0e8 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -17,6 +17,11 @@ * append relations, and thenceforth share code with the UNION ALL case. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -243,6 +248,16 @@ recurse_set_operations(Node *setOp, PlannerInfo *root, root, false, tuple_fraction, &subroot); +#ifdef XCP + if (subroot->distribution) + { + subplan = (Plan *) make_remotesubplan(subroot, + subplan, + NULL, + subroot->distribution, + subroot->query_pathkeys); + } +#endif /* Save subroot and subplan in RelOptInfo for setrefs.c */ rel->subplan = subplan; diff --git a/src/backend/optimizer/util/Makefile b/src/backend/optimizer/util/Makefile index 37244ad0be..3b2d16b635 100644 --- a/src/backend/optimizer/util/Makefile +++ b/src/backend/optimizer/util/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = clauses.o joininfo.o pathnode.o placeholder.o plancat.o predtest.o \ - relnode.o restrictinfo.o tlist.o var.o pgxcship.o + relnode.o restrictinfo.o tlist.o var.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ef3a50d82f..cfda133805 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3,6 +3,11 @@ * pathnode.c * Routines to manipulate pathlists and create path nodes * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,12 +31,15 @@ #include "optimizer/tlist.h" #include "parser/parsetree.h" #include "utils/lsyscache.h" -#include "utils/syscache.h" #include "utils/selfuncs.h" -#ifdef PGXC -#include "commands/tablecmds.h" -#include "optimizer/restrictinfo.h" -#endif /* PGXC */ +#ifdef XCP +#include "access/heapam.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "pgxc/locator.h" +#include "pgxc/nodemgr.h" +#include "utils/rel.h" +#endif typedef enum @@ -46,7 +54,15 @@ static void add_parameterized_path(RelOptInfo *parent_rel, Path *new_path); static List *translate_sub_tlist(List *tlist, int relid); static bool query_is_distinct_for(Query *query, List *colnos, List *opids); static Oid distinct_col_search(int colno, List *colnos, List *opids); - +#ifdef XCP +static void restrict_distribution(PlannerInfo *root, RestrictInfo *ri, + Path *pathnode); +static Path *redistribute_path(Path *subpath, char distributionType, + Bitmapset *nodes, Bitmapset *restrictNodes, + Node* distributionExpr); +static void set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode); +static List *set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode); +#endif /***************************************************************************** * MISC. 
PATH UTILITIES @@ -727,6 +743,926 @@ add_parameterized_path(RelOptInfo *parent_rel, Path *new_path) /***************************************************************************** * PATH NODE CREATION ROUTINES *****************************************************************************/ +#ifdef XCP +/* + * restrict_distribution + * Analyze the RestrictInfo and decide if it is possible to restrict + * distribution nodes + */ +static void +restrict_distribution(PlannerInfo *root, RestrictInfo *ri, + Path *pathnode) +{ + Distribution *distribution = pathnode->distribution; + Oid keytype; + Const *constExpr = NULL; + bool found_key = false; + + /* + * Can not restrict - not distributed or key is not defined + */ + if (distribution == NULL || + distribution->distributionExpr == NULL) + return; + + /* + * We do not support OR'ed conditions yet + */ + if (ri->orclause) + return; + + keytype = exprType(distribution->distributionExpr); + if (ri->left_ec) + { + EquivalenceClass *ec = ri->left_ec; + ListCell *lc; + foreach(lc, ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(lc); + if (equal(em->em_expr, distribution->distributionExpr)) + found_key = true; + else if (bms_is_empty(em->em_relids)) + { + Expr *cexpr = (Expr *) eval_const_expressions(root, + (Node *) em->em_expr); + if (IsA(cexpr, Const) && + ((Const *) cexpr)->consttype == keytype) + constExpr = (Const *) cexpr; + } + } + } + if (ri->right_ec) + { + EquivalenceClass *ec = ri->right_ec; + ListCell *lc; + foreach(lc, ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(lc); + if (equal(em->em_expr, distribution->distributionExpr)) + found_key = true; + else if (bms_is_empty(em->em_relids)) + { + Expr *cexpr = (Expr *) eval_const_expressions(root, + (Node *) em->em_expr); + if (IsA(cexpr, Const) && + ((Const *) cexpr)->consttype == keytype) + constExpr = (Const *) cexpr; + } + } + } + if (IsA(ri->clause, OpExpr)) + { + OpExpr *opexpr = (OpExpr *) ri->clause; + if 
(opexpr->args->length == 2 && + op_mergejoinable(opexpr->opno, exprType(linitial(opexpr->args)))) + { + Expr *arg1 = (Expr *) linitial(opexpr->args); + Expr *arg2 = (Expr *) lsecond(opexpr->args); + Expr *other = NULL; + if (equal(arg1, distribution->distributionExpr)) + other = arg2; + else if (equal(arg2, distribution->distributionExpr)) + other = arg1; + if (other) + { + found_key = true; + other = (Expr *) eval_const_expressions(root, (Node *) other); + if (IsA(other, Const) && + ((Const *) other)->consttype == keytype) + constExpr = (Const *) other; + } + } + } + if (found_key && constExpr) + { + List *nodeList = NIL; + Bitmapset *tmpset = bms_copy(distribution->nodes); + Bitmapset *restrictinfo = NULL; + Locator *locator; + int *nodenums; + int i, count; + + while((i = bms_first_member(tmpset)) >= 0) + nodeList = lappend_int(nodeList, i); + bms_free(tmpset); + + locator = createLocator(distribution->distributionType, + RELATION_ACCESS_READ, + keytype, + LOCATOR_LIST_LIST, + 0, + (void *) nodeList, + (void **) &nodenums, + false); + count = GET_NODES(locator, constExpr->constvalue, + constExpr->constisnull, NULL); + + for (i = 0; i < count; i++) + restrictinfo = bms_add_member(restrictinfo, nodenums[i]); + if (distribution->restrictNodes) + distribution->restrictNodes = bms_intersect(distribution->restrictNodes, + restrictinfo); + else + distribution->restrictNodes = restrictinfo; + list_free(nodeList); + freeLocator(locator); + } +} + +/* + * set_scanpath_distribution + * Assign distribution to the path which is a base relation scan. 
+ */ +static void +set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) +{ + RangeTblEntry *rte; + RelationLocInfo *rel_loc_info; + + rte = planner_rt_fetch(rel->relid, root); + rel_loc_info = GetRelationLocInfo(rte->relid); + if (rel_loc_info) + { + ListCell *lc; + Distribution *distribution = makeNode(Distribution); + distribution->distributionType = rel_loc_info->locatorType; + foreach(lc, rel_loc_info->nodeList) + distribution->nodes = bms_add_member(distribution->nodes, + lfirst_int(lc)); + distribution->restrictNodes = NULL; + /* + * Distribution expression of the base relation is Var representing + * respective attribute. + */ + distribution->distributionExpr = NULL; + if (rel_loc_info->partAttrNum) + { + Var *var = NULL; + ListCell *lc; + + /* Look if the Var is already in the target list */ + foreach (lc, rel->reltargetlist) + { + var = (Var *) lfirst(lc); + if (IsA(var, Var) && var->varno == rel->relid && + var->varattno == rel_loc_info->partAttrNum) + break; + } + /* If not found we should look up the attribute and make the Var */ + if (!lc) + { + Relation relation = heap_open(rte->relid, NoLock); + TupleDesc tdesc = RelationGetDescr(relation); + Form_pg_attribute att_tup; + + att_tup = tdesc->attrs[rel_loc_info->partAttrNum - 1]; + var = makeVar(rel->relid, rel_loc_info->partAttrNum, + att_tup->atttypid, att_tup->atttypmod, + att_tup->attcollation, 0); + + + heap_close(relation, NoLock); + } + + distribution->distributionExpr = (Node *) var; + } + pathnode->distribution = distribution; + } +} + + +/* + * Set a RemoteSubPath on top of the specified node and set specified + * distribution to it + */ +static Path * +redistribute_path(Path *subpath, char distributionType, + Bitmapset *nodes, Bitmapset *restrictNodes, + Node* distributionExpr) +{ + Distribution *distribution = NULL; + RelOptInfo *rel = subpath->parent; + RemoteSubPath *pathnode; + + if (distributionType != LOCATOR_TYPE_NONE) + { + distribution = 
makeNode(Distribution); + distribution->distributionType = distributionType; + distribution->nodes = nodes; + distribution->restrictNodes = restrictNodes; + distribution->distributionExpr = distributionExpr; + } + + /* + * If inner path node is a MaterialPath pull it up to store tuples on + * the destination nodes and avoid sending them over the network. + */ + if (IsA(subpath, MaterialPath)) + { + MaterialPath *mpath = (MaterialPath *) subpath; + /* If subpath is already a RemoteSubPath, just replace distribution */ + if (IsA(mpath->subpath, RemoteSubPath)) + { + pathnode = (RemoteSubPath *) mpath->subpath; + } + else + { + pathnode = makeNode(RemoteSubPath); + pathnode->path.pathtype = T_RemoteSubplan; + pathnode->path.parent = rel; + pathnode->path.param_info = subpath->param_info; + pathnode->path.pathkeys = subpath->pathkeys; + pathnode->subpath = mpath->subpath; + mpath->subpath = (Path *) pathnode; + } + subpath = pathnode->subpath; + pathnode->path.distribution = distribution; + mpath->path.distribution = (Distribution *) copyObject(distribution); + /* (re)calculate costs */ + cost_remote_subplan((Path *) pathnode, subpath->startup_cost, + subpath->total_cost, subpath->rows, rel->width, + IsLocatorReplicated(distributionType) ? + bms_num_members(nodes) : 1); + mpath->subpath = (Path *) pathnode; + cost_material(&mpath->path, + pathnode->path.startup_cost, + pathnode->path.total_cost, + pathnode->path.rows, + rel->width); + return (Path *) mpath; + } + else + { + pathnode = makeNode(RemoteSubPath); + pathnode->path.pathtype = T_RemoteSubplan; + pathnode->path.parent = rel; + pathnode->path.param_info = subpath->param_info; + pathnode->path.pathkeys = subpath->pathkeys; + pathnode->subpath = subpath; + pathnode->path.distribution = distribution; + cost_remote_subplan((Path *) pathnode, subpath->startup_cost, + subpath->total_cost, subpath->rows, rel->width, + IsLocatorReplicated(distributionType) ? 
+ bms_num_members(nodes) : 1); + return (Path *) pathnode; + } +} + + +static JoinPath * +flatCopyJoinPath(JoinPath *pathnode) +{ + JoinPath *newnode; + size_t size = 0; + switch(nodeTag(pathnode)) + { + case T_NestPath: + size = sizeof(NestPath); + break; + case T_MergePath: + size = sizeof(MergePath); + break; + case T_HashPath: + size = sizeof(HashPath); + break; + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(pathnode)); + break; + } + newnode = (JoinPath *) palloc(size); + memcpy(newnode, pathnode, size); + return newnode; +} + + +/* + * Analyze join parameters and set distribution of the join node. + * If there are possible alternate distributions the respective pathes are + * returned as a list so caller can cost all of them and choose cheapest to + * continue. + */ +static List * +set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) +{ + Distribution *innerd = pathnode->innerjoinpath->distribution; + Distribution *outerd = pathnode->outerjoinpath->distribution; + Distribution *targetd; + List *alternate = NIL; + + /* Catalog join */ + if (innerd == NULL && outerd == NULL) + return NIL; + + /* + * If both subpaths are distributed by replication, the resulting + * distribution will be replicated on smallest common set of nodes. + * Catalog tables are the same on all nodes, so treat them as replicated + * on all nodes. + */ + if ((!innerd || IsLocatorReplicated(innerd->distributionType)) && + (!outerd || IsLocatorReplicated(outerd->distributionType))) + { + /* Determine common nodes */ + Bitmapset *common; + + if (innerd == NULL) + common = bms_copy(outerd->nodes); + else if (outerd == NULL) + common = bms_copy(innerd->nodes); + else + common = bms_intersect(innerd->nodes, outerd->nodes); + if (bms_is_empty(common)) + goto not_allowed_join; + + /* + * Join result is replicated on common nodes. Running query on any + * of them produce correct result. 
+ */ + targetd = makeNode(Distribution); + targetd->distributionType = LOCATOR_TYPE_REPLICATED; + targetd->nodes = common; + targetd->restrictNodes = NULL; + pathnode->path.distribution = targetd; + return alternate; + } + + /* + * Check if we have inner replicated + * The "both replicated" case is already checked, so if innerd + * is replicated, then outerd is not replicated and it is not NULL. + * This case is not acceptable for some join types. If outer relation is + * nullable data nodes will produce joined rows with NULLs for cases when + * matching row exists, but on other data node. + */ + if ((!innerd || IsLocatorReplicated(innerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) + { + /* We need inner relation is defined on all nodes where outer is */ + if (innerd && !bms_is_subset(outerd->nodes, innerd->nodes)) + goto not_allowed_join; + + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + pathnode->path.distribution = targetd; + return alternate; + } + + + /* + * Check if we have outer replicated + * The "both replicated" case is already checked, so if outerd + * is replicated, then innerd is not replicated and it is not NULL. + * This case is not acceptable for some join types. If inner relation is + * nullable data nodes will produce joined rows with NULLs for cases when + * matching row exists, but on other data node. 
+ */ + if ((!outerd || IsLocatorReplicated(outerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_RIGHT)) + { + /* We need outer relation is defined on all nodes where inner is */ + if (outerd && !bms_is_subset(innerd->nodes, outerd->nodes)) + goto not_allowed_join; + + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + targetd->distributionExpr = innerd->distributionExpr; + pathnode->path.distribution = targetd; + return alternate; + } + + + /* + * This join is still allowed if inner and outer paths have + * equivalent distribution and joined along the distribution keys. + */ + if (innerd && outerd && + innerd->distributionType == outerd->distributionType && + innerd->distributionExpr && + outerd->distributionExpr && + bms_equal(innerd->nodes, outerd->nodes)) + { + ListCell *lc; + + /* + * Make sure distribution functions are the same, for now they depend + * on data type + */ + if (exprType((Node *) innerd->distributionExpr) != exprType((Node *) outerd->distributionExpr)) + goto not_allowed_join; + + /* + * Planner already did necessary work and if there is a join + * condition like left.key=right.key the key expressions + * will be members of the same equivalence class, and both + * sides of the corresponding RestrictInfo will refer that + * Equivalence Class. + * Try to figure out if such restriction exists. + */ + foreach(lc, pathnode->joinrestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + ListCell *emc; + bool found_outer, found_inner; + + /* + * Restriction operator is not equality operator ? + */ + if (ri->left_ec == NULL || ri->right_ec == NULL) + continue; + + /* + * A restriction with OR may be compatible if all OR'ed + * conditions are compatible. For the moment we do not + * check this and skip restriction. 
The case if multiple + * OR'ed conditions are compatible is rare and probably + * do not worth doing at all. + */ + if (ri->orclause) + continue; + + found_outer = false; + found_inner = false; + + /* + * If parts belong to the same equivalence member check + * if both distribution keys are members of the class. + */ + if (ri->left_ec == ri->right_ec) + { + foreach(emc, ri->left_ec->ec_members) + { + EquivalenceMember *em = (EquivalenceMember *) lfirst(emc); + Expr *var = (Expr *)em->em_expr; + if (!found_outer) + found_outer = equal(var, outerd->distributionExpr); + + if (!found_inner) + found_inner = equal(var, innerd->distributionExpr); + } + if (found_outer && found_inner) + { + ListCell *tlc, *emc; + + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + targetd->distributionExpr = NULL; + pathnode->path.distribution = targetd; + + /* + * Each member of the equivalence class may be a + * distribution expression, but we prefer some from the + * target list. 
+ */ + foreach(tlc, pathnode->path.parent->reltargetlist) + { + Expr *var = (Expr *) lfirst(tlc); + foreach(emc, ri->left_ec->ec_members) + { + EquivalenceMember *em; + Expr *emvar; + + em = (EquivalenceMember *) lfirst(emc); + emvar = (Expr *)em->em_expr; + if (equal(var, emvar)) + { + targetd->distributionExpr = (Node *) var; + return alternate; + } + } + } + /* Not found, take any */ + targetd->distributionExpr = innerd->distributionExpr; + return alternate; + } + } + /* + * Check clause, if both arguments are distribution keys and + * operator is an equality operator + */ + else + { + OpExpr *op_exp; + Expr *arg1, + *arg2; + + op_exp = (OpExpr *) ri->clause; + if (!IsA(op_exp, OpExpr) || list_length(op_exp->args) != 2) + continue; + + arg1 = (Expr *) linitial(op_exp->args); + arg2 = (Expr *) lsecond(op_exp->args); + + found_outer = equal(arg1, outerd->distributionExpr) || equal(arg2, outerd->distributionExpr); + found_inner = equal(arg1, innerd->distributionExpr) || equal(arg2, innerd->distributionExpr); + + if (found_outer && found_inner) + { + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + pathnode->path.distribution = targetd; + + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = innerd->distributionExpr; + else + targetd->distributionExpr = outerd->distributionExpr; + + return alternate; + } + } + } + } + + /* + * If we could not determine the distribution redistribute the subpathes. + */ +not_allowed_join: + /* + * If redistribution is required, sometimes the cheapest path would be if + * one of the subplan is replicated. 
If replication of any or all subplans + * is possible, return resulting plans as alternates. Try to distribute all + * by has as main variant. + */ + + /* These join types allow replicated inner */ + if (outerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) + { + /* + * Since we discard all alternate pathes except one it is OK if all they + * reference the same objects + */ + JoinPath *altpath = flatCopyJoinPath(pathnode); + /* Redistribute inner subquery */ + altpath->innerjoinpath = redistribute_path( + altpath->innerjoinpath, + LOCATOR_TYPE_REPLICATED, + bms_copy(outerd->nodes), + bms_copy(outerd->restrictNodes), + NULL); + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + altpath->path.distribution = targetd; + alternate = lappend(alternate, altpath); + } + + /* These join types allow replicated outer */ + if (innerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_RIGHT)) + { + /* + * Since we discard all alternate pathes except one it is OK if all they + * reference the same objects + */ + JoinPath *altpath = flatCopyJoinPath(pathnode); + /* Redistribute inner subquery */ + altpath->outerjoinpath = redistribute_path( + altpath->outerjoinpath, + LOCATOR_TYPE_REPLICATED, + bms_copy(innerd->nodes), + bms_copy(innerd->restrictNodes), + NULL); + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + targetd->distributionExpr = innerd->distributionExpr; + altpath->path.distribution = targetd; + alternate = lappend(alternate, altpath); + } + + /* + * Redistribute subplans to make them compatible. 
+ * If any of the subplans is a coordinator subplan skip this stuff and do + * coordinator join. + */ + if (innerd && outerd) + { + RestrictInfo *preferred = NULL; + Expr *new_inner_key = NULL; + Expr *new_outer_key = NULL; + char distType = LOCATOR_TYPE_NONE; + ListCell *lc; + + /* + * Look through the join restrictions to find one that is a hashable + * operator on two arguments. Choose best restriction acoording to + * following criteria: + * 1. one argument is already a partitioning key of one subplan. + * 2. restriction is cheaper to calculate + */ + foreach(lc, pathnode->joinrestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + + /* can not handle ORed conditions */ + if (ri->orclause) + continue; + + if (IsA(ri->clause, OpExpr)) + { + OpExpr *expr = (OpExpr *) ri->clause; + if (list_length(expr->args) == 2 && + op_hashjoinable(expr->opno, exprType(linitial(expr->args)))) + { + Expr *left = (Expr *) linitial(expr->args); + Expr *right = (Expr *) lsecond(expr->args); + Oid leftType = exprType((Node *) left); + Oid rightType = exprType((Node *) right); + Relids inner_rels = pathnode->innerjoinpath->parent->relids; + Relids outer_rels = pathnode->outerjoinpath->parent->relids; + QualCost cost; + + /* + * Check if both parts are of the same data type and choose + * distribution type to redistribute. + * XXX We may want more sophisticated algorithm to choose + * the best condition to redistribute parts along. + * For now use simple but reliable approach. 
+ */ + if (leftType != rightType) + continue; + /* + * Evaluation cost will be needed to choose preferred + * distribution + */ + cost_qual_eval_node(&cost, (Node *) ri, root); + + if (outerd->distributionExpr) + { + /* + * If left side is distribution key of outer subquery + * and right expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, left) && + bms_is_subset(ri->right_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = right; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of outer subquery + * and left expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, right) && + bms_is_subset(ri->left_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = left; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + } + if (innerd->distributionExpr) + { + /* + * If left side is distribution key of inner subquery + * and right expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, left) && + bms_is_subset(ri->right_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < 
preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = right; + distType = innerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of inner subquery + * and left expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, right) && + bms_is_subset(ri->left_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = left; + distType = innerd->distributionType; + } + continue; + } + } + /* + * Current restriction recuire redistribution of both parts. + * If preferred restriction require redistribution of one, + * keep it. + */ + if (preferred && + (new_inner_key == NULL || new_outer_key == NULL)) + continue; + + /* + * Skip this condition if the data type of the expressions + * does not allow either HASH or MODULO distribution. + * HASH distribution is preferrable. + */ + if (IsTypeHashDistributable(leftType)) + distType = LOCATOR_TYPE_HASH; + else if (IsTypeModuloDistributable(leftType)) + distType = LOCATOR_TYPE_MODULO; + else + continue; + /* + * If this restriction the first or easier to calculate + * then preferred, try to store it as new preferred + * restriction to redistribute along it. 
+ */ + if (preferred == NULL || + (cost.per_tuple < preferred->eval_cost.per_tuple)) + { + /* + * Left expression depends only on outer subpath and + * right expression depends only on inner subpath, so + * we can redistribute both and make left expression the + * distribution key of outer subplan and right + * expression the distribution key of inner subplan + */ + if (bms_is_subset(ri->left_relids, outer_rels) && + bms_is_subset(ri->right_relids, inner_rels)) + { + preferred = ri; + new_outer_key = left; + new_inner_key = right; + } + /* + * Left expression depends only on inner subpath and + * right expression depends only on outer subpath, so + * we can redistribute both and make left expression the + * distribution key of inner subplan and right + * expression the distribution key of outer subplan + */ + if (bms_is_subset(ri->left_relids, inner_rels) && + bms_is_subset(ri->right_relids, outer_rels)) + { + preferred = ri; + new_inner_key = left; + new_outer_key = right; + } + } + } + } + } + /* If we have suitable restriction we can repartition accordingly */ + if (preferred) + { + Bitmapset *nodes = NULL; + Bitmapset *restrictNodes = NULL; + + /* If we redistribute both parts do join on all nodes ... */ + if (new_inner_key && new_outer_key) + { + int i; + for (i = 0; i < NumDataNodes; i++) + nodes = bms_add_member(nodes, i); + } + /* + * ... if we do only one of them redistribute it on the same nodes + * as other. 
+ */ + else if (new_inner_key) + { + nodes = bms_copy(outerd->nodes); + restrictNodes = bms_copy(outerd->restrictNodes); + } + else /*if (new_outer_key)*/ + { + nodes = bms_copy(innerd->nodes); + restrictNodes = bms_copy(innerd->restrictNodes); + } + + /* + * Redistribute join by hash, and, if jointype allows, create + * alternate path where inner subplan is distributed by replication + */ + if (new_inner_key) + { + /* Redistribute inner subquery */ + pathnode->innerjoinpath = redistribute_path( + pathnode->innerjoinpath, + distType, + nodes, + restrictNodes, + (Node *) new_inner_key); + } + /* + * Redistribute join by hash, and, if jointype allows, create + * alternate path where outer subplan is distributed by replication + */ + if (new_outer_key) + { + /* Redistribute outer subquery */ + pathnode->outerjoinpath = redistribute_path( + pathnode->outerjoinpath, + distType, + nodes, + restrictNodes, + (Node *) new_outer_key); + } + targetd = makeNode(Distribution); + targetd->distributionType = distType; + targetd->nodes = nodes; + targetd->restrictNodes = NULL; + pathnode->path.distribution = targetd; + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + * NB: we should not refer innerd and outerd here, subpathes are + * redistributed already + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = + pathnode->innerjoinpath->distribution->distributionExpr; + else + targetd->distributionExpr = + pathnode->outerjoinpath->distribution->distributionExpr; + + return alternate; + } + } + + /* + * Build cartesian product, if no hasheable restrictions is found. + * Perform coordinator join in such cases. If this join would be a part of + * larger join, it will be handled as replicated. 
+ * To do that leave join distribution NULL and place a RemoteSubPath node on + * top of each subpath to provide access to joined result sets. + * Do not redistribute pathes that already have NULL distribution, this is + * possible if performing outer join on a coordinator and a datanode + * relations. + */ + if (innerd) + pathnode->innerjoinpath = redistribute_path(pathnode->innerjoinpath, + LOCATOR_TYPE_NONE, + NULL, + NULL, + NULL); + if (outerd) + pathnode->outerjoinpath = redistribute_path(pathnode->outerjoinpath, + LOCATOR_TYPE_NONE, + NULL, + NULL, + NULL); + return alternate; +} +#endif + /* * create_seqscan_path @@ -744,6 +1680,19 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) required_outer); pathnode->pathkeys = NIL; /* seqscan has unordered result */ +#ifdef XCP + set_scanpath_distribution(root, rel, pathnode); + if (rel->baserestrictinfo) + { + ListCell *lc; + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + restrict_distribution(root, ri, pathnode); + } + } +#endif + cost_seqscan(pathnode, root, rel, pathnode->param_info); return pathnode; @@ -810,6 +1759,18 @@ create_index_path(PlannerInfo *root, pathnode->indexorderbycols = indexorderbycols; pathnode->indexscandir = indexscandir; +#ifdef XCP + set_scanpath_distribution(root, rel, (Path *) pathnode); + if (indexclauses) + { + ListCell *lc; + foreach (lc, indexclauses) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + restrict_distribution(root, ri, (Path *) pathnode); + } + } +#endif cost_index(pathnode, root, loop_count); return pathnode; @@ -844,6 +1805,19 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->bitmapqual = bitmapqual; +#ifdef XCP + set_scanpath_distribution(root, rel, (Path *) pathnode); + if (rel->baserestrictinfo) + { + ListCell *lc; + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + restrict_distribution(root, ri, (Path *) pathnode); + } + } +#endif + 
cost_bitmap_heap_scan(&pathnode->path, root, rel, pathnode->path.param_info, bitmapqual, loop_count); @@ -869,6 +1843,10 @@ create_bitmap_and_path(PlannerInfo *root, pathnode->bitmapquals = bitmapquals; +#ifdef XCP + set_scanpath_distribution(root, rel, (Path *) pathnode); +#endif + /* this sets bitmapselectivity as well as the regular cost fields: */ cost_bitmap_and_node(pathnode, root); @@ -893,6 +1871,10 @@ create_bitmap_or_path(PlannerInfo *root, pathnode->bitmapquals = bitmapquals; +#ifdef XCP + set_scanpath_distribution(root, rel, (Path *) pathnode); +#endif + /* this sets bitmapselectivity as well as the regular cost fields: */ cost_bitmap_or_node(pathnode, root); @@ -915,6 +1897,13 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals) pathnode->tidquals = tidquals; +#ifdef XCP + set_scanpath_distribution(root, rel, (Path *) pathnode); + /* We may need to pass info about target node to support */ + if (pathnode->path.distribution) + elog(ERROR, "could not perform TID scan on remote relation"); +#endif + cost_tidscan(&pathnode->path, root, rel, tidquals); return pathnode; @@ -932,6 +1921,10 @@ create_append_path(RelOptInfo *rel, List *subpaths, Relids required_outer) { AppendPath *pathnode = makeNode(AppendPath); ListCell *l; +#ifdef XCP + Distribution *distribution; + Path *subpath; +#endif pathnode->path.pathtype = T_Append; pathnode->path.parent = rel; @@ -939,6 +1932,65 @@ create_append_path(RelOptInfo *rel, List *subpaths, Relids required_outer) required_outer); pathnode->path.pathkeys = NIL; /* result is always considered * unsorted */ +#ifdef XCP + /* + * Append path is used to implement scans of inherited tables and some + * "set" operations, like UNION ALL. While all inherited tables should + * have the same distribution, UNION'ed queries may have different. + * When paths being appended have the same distribution it is OK to push + * Append down to the data nodes. If not, perform "coordinator" Append. 
+ */ + + /* Special case of the dummy relation, if the subpaths list is empty */ + if (subpaths) + { + /* Take distribution of the first node */ + l = list_head(subpaths); + subpath = (Path *) lfirst(l); + distribution = copyObject(subpath->distribution); + /* + * Check remaining subpaths, if all distributions equal to the first set + * it as a distribution of the Append path; otherwise make up coordinator + * Append + */ + while ((l = lnext(l))) + { + subpath = (Path *) lfirst(l); + + if (equal(distribution, subpath->distribution)) + { + /* + * Both distribution and subpath->distribution may be NULL at + * this point, or they both are not null. + */ + if (distribution && subpath->distribution->restrictNodes) + distribution->restrictNodes = bms_union( + distribution->restrictNodes, + subpath->distribution->restrictNodes); + } + else + { + break; + } + } + if (l) + { + List *newsubpaths = NIL; + foreach(l, subpaths) + { + subpath = (Path *) lfirst(l); + if (subpath->distribution) + subpath = redistribute_path(subpath, LOCATOR_TYPE_NONE, + NULL, NULL, NULL); + newsubpaths = lappend(newsubpaths, subpath); + } + subpaths = newsubpaths; + pathnode->path.distribution = NULL; + } + else + pathnode->path.distribution = distribution; + } +#endif pathnode->subpaths = subpaths; /* @@ -985,9 +2037,70 @@ create_merge_append_path(PlannerInfo *root, Cost input_startup_cost; Cost input_total_cost; ListCell *l; +#ifdef XCP + Distribution *distribution = NULL; + Path *subpath; +#endif pathnode->path.pathtype = T_MergeAppend; pathnode->path.parent = rel; +#ifdef XCP + /* + * It is safe to push down MergeAppend if all subpath distributions + * are the same and these distributions are Replicated or distribution key + * is the expression of the first pathkey. 
+ */ + /* Take distribution of the first node */ + l = list_head(subpaths); + subpath = (Path *) lfirst(l); + distribution = copyObject(subpath->distribution); + /* + * Verify if it is safe to push down MergeAppend with this distribution. + * TODO implement check of the second condition (distribution key is the + * first pathkey) + */ + if (distribution == NULL || IsLocatorReplicated(distribution->distributionType)) + { + /* + * Check remaining subpaths, if all distributions equal to the first set + * it as a distribution of the Append path; otherwise make up coordinator + * Append + */ + while ((l = lnext(l))) + { + subpath = (Path *) lfirst(l); + + if (distribution && equal(distribution, subpath->distribution)) + { + if (subpath->distribution->restrictNodes) + distribution->restrictNodes = bms_union( + distribution->restrictNodes, + subpath->distribution->restrictNodes); + } + else + { + break; + } + } + } + if (l) + { + List *newsubpaths = NIL; + foreach(l, subpaths) + { + subpath = (Path *) lfirst(l); + if (subpath->distribution) + subpath = redistribute_path(subpath, LOCATOR_TYPE_NONE, + NULL, NULL, NULL); + newsubpaths = lappend(newsubpaths, subpath); + } + subpaths = newsubpaths; + pathnode->path.distribution = NULL; + } + else + pathnode->path.distribution = distribution; +#endif + pathnode->path.param_info = get_appendrel_parampathinfo(rel, required_outer); pathnode->path.pathkeys = pathkeys; @@ -1101,6 +2214,10 @@ create_material_path(RelOptInfo *rel, Path *subpath) pathnode->subpath = subpath; +#ifdef XCP + pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution); +#endif + cost_material(&pathnode->path, subpath->startup_cost, subpath->total_cost, @@ -1297,6 +2414,32 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, if (contain_volatile_functions((Node *) uniq_exprs)) goto no_unique_path; +#ifdef XCP + /* + * We may only guarantee uniqueness if subplan is either replicated or it is + * partitioned and one of 
the unigue expressions equals to the + * distribution expression. + */ + if (subpath->distribution && + !IsLocatorReplicated(subpath->distribution->distributionType)) + { + /* Punt if no distribution key */ + if (subpath->distribution->distributionExpr == NULL) + goto no_unique_path; + + foreach(lc, uniq_exprs) + { + void *expr = lfirst(lc); + if (equal(expr, subpath->distribution->distributionExpr)) + break; + } + + /* XXX we may try and repartition if no matching expression */ + if (!lc) + goto no_unique_path; + } +#endif + /* * If we get here, we can unique-ify using at least one of sorting and * hashing. Start building the result Path object. @@ -1317,6 +2460,11 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->in_operators = in_operators; pathnode->uniq_exprs = uniq_exprs; +#ifdef XCP + /* distribution is the same as in the subpath */ + pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution); +#endif + /* * If the input is a relation and it has a unique index that proves the * uniq_exprs are unique, then we don't need to do anything. Note that @@ -1640,8 +2788,14 @@ distinct_col_search(int colno, List *colnos, List *opids) * returning the pathnode. 
*/ Path * +#ifdef XCP +create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, + List *pathkeys, Relids required_outer, + Distribution *distribution) +#else create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, List *pathkeys, Relids required_outer) +#endif { Path *pathnode = makeNode(Path); @@ -1650,6 +2804,9 @@ create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->param_info = get_baserel_parampathinfo(root, rel, required_outer); pathnode->pathkeys = pathkeys; +#ifdef XCP + pathnode->distribution = distribution; +#endif cost_subqueryscan(pathnode, root, rel, pathnode->param_info); @@ -1737,6 +2894,33 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel) return pathnode; } + +#ifdef PGXC +#ifndef XCP +/* + * create_remotequery_path + * Creates a path corresponding to a scan of a remote query, + * returning the pathnode. + */ +Path * +create_remotequery_path(PlannerInfo *root, RelOptInfo *rel) +{ + Path *pathnode = makeNode(Path); + + pathnode->pathtype = T_RemoteQuery; + pathnode->parent = rel; + pathnode->param_info = NULL; /* never parameterized at present */ + pathnode->pathkeys = NIL; /* result is always unordered */ + + /* PGXCTODO - set cost properly */ + cost_seqscan(pathnode, root, rel, pathnode->param_info); + + return pathnode; +} +#endif /* XCP */ +#endif /* PGXC */ + + /* * create_foreignscan_path * Creates a path corresponding to a scan of a foreign table, @@ -1856,6 +3040,10 @@ create_nestloop_path(PlannerInfo *root, Relids required_outer) { NestPath *pathnode = makeNode(NestPath); +#ifdef XCP + List *alternate; + ListCell *lc; +#endif Relids inner_req_outer = PATH_REQ_OUTER(inner_path); /* @@ -1900,8 +3088,24 @@ create_nestloop_path(PlannerInfo *root, pathnode->innerjoinpath = inner_path; pathnode->joinrestrictinfo = restrict_clauses; +#ifdef XCP + alternate = set_joinpath_distribution(root, pathnode); +#endif final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors); +#ifdef XCP + /* + * Also 
calculate costs of all alternates and return cheapest path + */ + foreach(lc, alternate) + { + NestPath *altpath = (NestPath *) lfirst(lc); + final_cost_nestloop(root, altpath, workspace, sjinfo, semifactors); + if (altpath->path.total_cost < pathnode->path.total_cost) + pathnode = altpath; + } +#endif + return pathnode; } @@ -1940,6 +3144,10 @@ create_mergejoin_path(PlannerInfo *root, List *innersortkeys) { MergePath *pathnode = makeNode(MergePath); +#ifdef XCP + List *alternate; + ListCell *lc; +#endif pathnode->jpath.path.pathtype = T_MergeJoin; pathnode->jpath.path.parent = joinrel; @@ -1959,10 +3167,25 @@ create_mergejoin_path(PlannerInfo *root, pathnode->path_mergeclauses = mergeclauses; pathnode->outersortkeys = outersortkeys; pathnode->innersortkeys = innersortkeys; +#ifdef XCP + alternate = set_joinpath_distribution(root, (JoinPath *) pathnode); +#endif /* pathnode->materialize_inner will be set by final_cost_mergejoin */ - final_cost_mergejoin(root, pathnode, workspace, sjinfo); +#ifdef XCP + /* + * Also calculate costs of all alternates and return cheapest path + */ + foreach(lc, alternate) + { + MergePath *altpath = (MergePath *) lfirst(lc); + final_cost_mergejoin(root, altpath, workspace, sjinfo); + if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) + pathnode = altpath; + } +#endif + return pathnode; } @@ -1996,6 +3219,10 @@ create_hashjoin_path(PlannerInfo *root, List *hashclauses) { HashPath *pathnode = makeNode(HashPath); +#ifdef XCP + List *alternate; + ListCell *lc; +#endif pathnode->jpath.path.pathtype = T_HashJoin; pathnode->jpath.path.parent = joinrel; @@ -2025,10 +3252,25 @@ create_hashjoin_path(PlannerInfo *root, pathnode->jpath.innerjoinpath = inner_path; pathnode->jpath.joinrestrictinfo = restrict_clauses; pathnode->path_hashclauses = hashclauses; +#ifdef XCP + alternate = set_joinpath_distribution(root, (JoinPath *) pathnode); +#endif /* final_cost_hashjoin will fill in pathnode->num_batches */ - 
final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors); +#ifdef XCP + /* + * Calculate costs of all alternates and return cheapest path + */ + foreach(lc, alternate) + { + HashPath *altpath = (HashPath *) lfirst(lc); + final_cost_hashjoin(root, altpath, workspace, sjinfo, semifactors); + if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) + pathnode = altpath; + } +#endif + return pathnode; } @@ -2093,8 +3335,13 @@ reparameterize_path(PlannerInfo *root, Path *path, loop_count); } case T_SubqueryScan: +#ifdef XCP + return create_subqueryscan_path(root, rel, path->pathkeys, + required_outer, path->distribution); +#else return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); +#endif default: break; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 54fe5732da..bc7e8a6096 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -4,6 +4,11 @@ * routines for accessing the system catalogs * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -338,6 +343,16 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, */ if (info->indpred == NIL) { +#ifdef XCP + /* + * If parent relation is distributed the local storage manager + * does not have actual information about index size. + * We have to get relation statistics instead. 
+ */ + if (IS_PGXC_COORDINATOR && relation->rd_locator_info != NULL) + info->pages = indexRelation->rd_rel->relpages; + else +#endif info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } @@ -396,7 +411,8 @@ estimate_rel_size(Relation rel, int32 *attr_widths, { case RELKIND_RELATION: #ifdef PGXC - /* +#ifndef XCP + /* * This is a remote table... we have no idea how many pages/rows * we may get from a scan of this table. However, we should set the * costs in such a manner that cheapest paths should pick up the @@ -419,8 +435,21 @@ estimate_rel_size(Relation rel, int32 *attr_widths, break; } #endif +#endif case RELKIND_INDEX: case RELKIND_TOASTVALUE: +#ifdef XCP + if (IS_PGXC_COORDINATOR && rel->rd_locator_info != NULL) + { + /* + * Remote table does not store rows locally, so storage manager + * does not know how many pages are there, we rely on relation + * statistics. + */ + curpages = rel->rd_rel->relpages; + } + else +#endif /* it has storage, ok to call the smgr */ curpages = RelationGetNumberOfBlocks(rel); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index dce0b9330e..85feefdfce 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -14,6 +14,11 @@ * contain optimizable statements, which we should transform. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -25,6 +30,11 @@ #include "postgres.h" #include "access/sysattr.h" +#ifdef XCP +#include "catalog/pg_namespace.h" +#include "catalog/namespace.h" +#include "utils/builtins.h" +#endif #ifdef PGXC #include "catalog/pg_inherits.h" #include "catalog/pg_inherits_fn.h" @@ -54,7 +64,7 @@ #include "pgxc/pgxcnode.h" #include "access/gtm.h" #include "utils/lsyscache.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "tcop/tcopprot.h" #include "nodes/nodes.h" #include "pgxc/poolmgr.h" @@ -90,14 +100,19 @@ static Query *transformCreateTableAsStmt(ParseState *pstate, CreateTableAsStmt *stmt); #ifdef PGXC static Query *transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt); +#ifndef XCP static bool IsExecDirectUtilityStmt(Node *node); static bool is_relation_child(RangeTblEntry *child_rte, List *rtable); static bool is_rel_child_of_rel(RangeTblEntry *child_rte, RangeTblEntry *parent_rte); #endif +#endif static void transformLockingClause(ParseState *pstate, Query *qry, LockingClause *lc, bool pushedDown); +#ifdef XCP +static void ParseAnalyze_rtable_walk(List *rtable); +#endif /* * parse_analyze @@ -549,8 +564,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ParseState *sub_pstate = make_parsestate(pstate); Query *selectQuery; #ifdef PGXC +#ifndef XCP RangeTblEntry *target_rte; #endif +#endif /* * Process the source SELECT. 
@@ -584,6 +601,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) makeAlias("*SELECT*", NIL), false); #ifdef PGXC +#ifndef XCP /* * For an INSERT SELECT involving INSERT on a child after scanning * the parent, set flag to send command ID communication to remote @@ -599,6 +617,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } } #endif +#endif rtr = makeNode(RangeTblRef); /* assume new rte is at end */ rtr->rtindex = list_length(pstate->p_rtable); @@ -2350,7 +2369,9 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) List *raw_parsetree_list; ListCell *raw_parsetree_item; char *nodename; +#ifndef XCP Oid nodeoid; +#endif int nodeIndex; char nodetype; @@ -2370,6 +2391,15 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) /* There is a single element here */ nodename = strVal(linitial(nodelist)); +#ifdef XCP + nodetype = PGXC_NODE_NONE; + nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + nodename))); +#else nodeoid = get_pgxc_nodeoid(nodename); if (!OidIsValid(nodeoid)) @@ -2381,6 +2411,7 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) /* Get node type and index */ nodetype = get_pgxc_nodetype(nodeoid); nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid)); +#endif /* Check if node is requested is the self-node or not */ if (nodetype == PGXC_NODE_COORDINATOR && nodeIndex == PGXCNodeId - 1) @@ -2405,13 +2436,16 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) result = parse_analyze(parsetree, query, NULL, 0); } +#ifndef XCP /* Needed by planner */ result->sql_statement = pstrdup(query); +#endif /* Default list of parameters to set */ step->sql_statement = NULL; step->exec_nodes = makeNode(ExecNodes); step->combine_type = COMBINE_TYPE_NONE; + step->sort = NULL; step->read_only = true; step->force_autocommit = 
false; step->cursor = NULL; @@ -2423,7 +2457,17 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) else step->exec_type = EXEC_ON_DATANODES; + step->reduce_level = 0; step->base_tlist = NIL; + step->outer_alias = NULL; + step->inner_alias = NULL; + step->outer_reduce_level = 0; + step->inner_reduce_level = 0; + step->outer_relids = NULL; + step->inner_relids = NULL; + step->inner_statement = NULL; + step->outer_statement = NULL; + step->join_condition = NULL; /* Change the list of nodes that will be executed for the query and others */ step->force_autocommit = false; @@ -2463,14 +2507,15 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) } } +#ifndef XCP /* * Features not yet supported * DML can be launched without errors but this could compromise data * consistency, so block it. */ - if (!xc_maintenance_mode && (step->exec_direct_type == EXEC_DIRECT_DELETE - || step->exec_direct_type == EXEC_DIRECT_UPDATE - || step->exec_direct_type == EXEC_DIRECT_INSERT)) + if (step->exec_direct_type == EXEC_DIRECT_DELETE + || step->exec_direct_type == EXEC_DIRECT_UPDATE + || step->exec_direct_type == EXEC_DIRECT_INSERT) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("EXECUTE DIRECT cannot execute DML queries"))); @@ -2488,18 +2533,22 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("EXECUTE DIRECT cannot execute locally this utility query"))); } +#endif /* Build Execute Node list, there is a unique node for the time being */ step->exec_nodes->nodeList = lappend_int(step->exec_nodes->nodeList, nodeIndex); /* Associate newly-created RemoteQuery node to the returned Query result */ +#ifndef XCP result->is_local = is_local; +#endif if (!is_local) result->utilityStmt = (Node *) step; return result; } +#ifndef XCP /* * Check if given node is authorized to go through EXECUTE DURECT */ @@ -2615,6 +2664,7 @@ is_rel_child_of_rel(RangeTblEntry *child_rte, RangeTblEntry 
*parent_rte) } #endif +#endif /* * Check for features that are not supported together with FOR UPDATE/SHARE. @@ -2848,3 +2898,76 @@ applyLockingClause(Query *qry, Index rtindex, rc->pushedDown = pushedDown; qry->rowMarks = lappend(qry->rowMarks, rc); } + +#ifdef XCP +/* + * Check if the query contains references to any pg_catalog tables that should + * be remapped to storm_catalog. The list is obtained from the + * storm_catalog_remap_string GUC. Also do this only for normal users + */ +void +ParseAnalyze_callback(ParseState *pstate, Query *query) +{ + ParseAnalyze_rtable_walk(query->rtable); +} + +static void +ParseAnalyze_rtable_walk(List *rtable) +{ + ListCell *item; + StringInfoData buf; + + if (!IsUnderPostmaster || superuser()) + return; + + initStringInfo(&buf); + foreach(item, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(item); + + resetStringInfo(&buf); + if (rte->rtekind == RTE_FUNCTION && + get_func_namespace(((FuncExpr *) rte->funcexpr)->funcid) == + PG_CATALOG_NAMESPACE) + { + Oid funcid = InvalidOid; + + FuncExpr *funcexpr = (FuncExpr *) rte->funcexpr; + const char *funcname = get_func_name(funcexpr->funcid); + + /* Check if the funcname is in storm_catalog_remap_string */ + appendStringInfoString(&buf, funcname); + appendStringInfoChar(&buf, ','); + + elog(DEBUG2, "the constructed name is %s", buf.data); + + /* + * The unqualified function name should be satisfied from the + * storm_catalog appropriately. Just provide a warning for now if + * it is not.. 
+ */ + if (strstr(storm_catalog_remap_string, buf.data)) + { + Oid *argtypes = NULL; + int nargs; + + get_func_signature(funcexpr->funcid, &argtypes, &nargs); + funcid = get_funcid(funcname, buildoidvector(argtypes, nargs), + STORM_CATALOG_NAMESPACE); + } + else + continue; + + if (get_func_namespace(funcid) != STORM_CATALOG_NAMESPACE) + ereport(WARNING, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("Entry (%s) present in storm_catalog_remap_string " + "but object not picked from STORM_CATALOG", funcname))); + else /* change the funcid to the storm_catalog one */ + funcexpr->funcid = funcid; + } + else if (rte->rtekind == RTE_SUBQUERY) /* recurse for subqueries */ + ParseAnalyze_rtable_walk(rte->subquery->rtable); + } +} +#endif diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 718de74092..d8d64c4d21 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -234,7 +234,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType, DeallocateStmt PrepareStmt ExecuteStmt DropOwnedStmt ReassignOwnedStmt AlterTSConfigurationStmt AlterTSDictionaryStmt - BarrierStmt AlterNodeStmt CreateNodeStmt DropNodeStmt + BarrierStmt PauseStmt AlterNodeStmt CreateNodeStmt DropNodeStmt CreateNodeGroupStmt DropNodeGroupStmt %type <node> select_no_parens select_with_parens select_clause @@ -367,6 +367,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType, %type <defelt> opt_binary opt_oids copy_delimiter %type <str> DirectStmt CleanConnDbName CleanConnUserName +%type <boolean> OptCluster /* PGXC_END */ %type <boolean> copy_from @@ -558,7 +559,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType, OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR ORDER OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER - PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POSITION + PARSER PARTIAL PARTITION PASSING PASSWORD PAUSE PLACING PLANS POSITION /* PGXC_BEGIN 
*/ PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY /* PGXC_END */ @@ -582,7 +583,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType, TRUNCATE TRUSTED TYPE_P TYPES_P UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED - UNTIL UPDATE USER USING + UNPAUSE UNTIL UPDATE USER USING VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING VERBOSE VERSION_P VIEW VOLATILE @@ -793,6 +794,7 @@ stmt : | LoadStmt | LockStmt | NotifyStmt + | PauseStmt | PrepareStmt | ReassignOwnedStmt | ReindexStmt @@ -8428,6 +8430,20 @@ opt_name_list: /* PGXC_BEGIN */ +PauseStmt: PAUSE CLUSTER + { + PauseClusterStmt *n = makeNode(PauseClusterStmt); + n->pause = true; + $$ = (Node *)n; + } + | UNPAUSE CLUSTER + { + PauseClusterStmt *n = makeNode(PauseClusterStmt); + n->pause = false; + $$ = (Node *)n; + } + ; + BarrierStmt: CREATE BARRIER opt_barrier_id { BarrierStmt *n = makeNode(BarrierStmt); @@ -8489,7 +8505,7 @@ pgxcnode_list: /***************************************************************************** * * QUERY: - * ALTER NODE nodename WITH + * ALTER [CLUSTER] NODE nodename WITH * ( * [ TYPE = ('datanode' | 'coordinator'), ] * [ HOST = 'hostname', ] @@ -8498,13 +8514,17 @@ pgxcnode_list: * [ PREFERRED [ = boolean ], ] * ) * + * If CLUSTER is mentioned, the command is executed on all nodes. + * PS: We need to add this option on all other pertinent NODE ddl + * operations too!) 
*****************************************************************************/ -AlterNodeStmt: ALTER NODE pgxcnode_name OptWith +AlterNodeStmt: ALTER OptCluster NODE pgxcnode_name OptWith { AlterNodeStmt *n = makeNode(AlterNodeStmt); - n->node_name = $3; - n->options = $4; + n->cluster = $2; + n->node_name = $4; + n->options = $5; $$ = (Node *)n; } ; @@ -8555,6 +8575,10 @@ DropNodeGroupStmt: DROP NODE GROUP_P pgxcgroup_name } ; +OptCluster: CLUSTER { $$ = TRUE; } + | /* EMPTY */ { $$ = FALSE; } + ; + /* PGXC_END */ /***************************************************************************** @@ -12793,6 +12817,9 @@ unreserved_keyword: | PARTITION | PASSING | PASSWORD +/* PGXC_BEGIN */ + | PAUSE +/* PGXC_END */ | PLANS | PRECEDING /* PGXC_BEGIN */ @@ -12874,6 +12901,9 @@ unreserved_keyword: | UNKNOWN | UNLISTEN | UNLOGGED +/* PGXC_BEGIN */ + | UNPAUSE +/* PGXC_END */ | UNTIL | UPDATE | VACUUM diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c index 652d423787..670d98c3a2 100644 --- a/src/backend/parser/parse_agg.c +++ b/src/backend/parser/parse_agg.c @@ -3,6 +3,11 @@ * parse_agg.c * handle aggregates and window functions in parser * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -79,8 +84,10 @@ transformAggregateCall(ParseState *pstate, Aggref *agg, int min_varlevel; ListCell *lc; #ifdef PGXC +#ifndef XCP HeapTuple aggTuple; Form_pg_aggregate aggform; +#endif /* XCP */ #endif /* PGXC */ /* @@ -192,6 +199,7 @@ transformAggregateCall(ParseState *pstate, Aggref *agg, pstate = pstate->parentParseState; pstate->p_hasAggs = true; #ifdef PGXC +#ifndef XCP /* * Return data type of PGXC Datanode's aggregate should always return the * result of transition function, that is expected by collection function @@ -213,6 +221,7 @@ transformAggregateCall(ParseState *pstate, Aggref *agg, ReleaseSysCache(aggTuple); #endif +#endif } /* @@ -751,11 +760,20 @@ void build_aggregate_fnexprs(Oid *agg_input_types, int agg_num_inputs, Oid agg_state_type, +#ifdef XCP + Oid agg_collect_type, +#endif Oid agg_result_type, Oid agg_input_collation, Oid transfn_oid, +#ifdef XCP + Oid collectfn_oid, +#endif Oid finalfn_oid, Expr **transfnexpr, +#ifdef XCP + Expr **collectfnexpr, +#endif Expr **finalfnexpr) { Param *argp; @@ -797,6 +815,40 @@ build_aggregate_fnexprs(Oid *agg_input_types, agg_input_collation, COERCE_DONTCARE); +#ifdef XCP + /* see if we have a collect function */ + if (OidIsValid(collectfn_oid)) + { + Param *argp2; + /* + * Build expr tree for collect function + */ + argp = makeNode(Param); + argp->paramkind = PARAM_EXEC; + argp->paramid = -1; + argp->paramtype = agg_collect_type; + argp->paramtypmod = -1; + argp->location = -1; + + argp2 = makeNode(Param); + argp2->paramkind = PARAM_EXEC; + argp2->paramid = -1; + argp2->paramtype = agg_state_type; + argp2->paramtypmod = -1; + argp2->location = -1; + args = list_make2(argp, argp2); + + *collectfnexpr = (Expr *) makeFuncExpr(collectfn_oid, + agg_collect_type, + args, + InvalidOid, + agg_input_collation, + COERCE_DONTCARE); + } + else + *collectfnexpr = NULL; +#endif 
+ /* see if we have a final function */ if (!OidIsValid(finalfn_oid)) { @@ -810,6 +862,15 @@ build_aggregate_fnexprs(Oid *agg_input_types, argp = makeNode(Param); argp->paramkind = PARAM_EXEC; argp->paramid = -1; + /* + * When running Phase 2 of distributed aggregation we may have only + * transient and final functions defined. + */ +#ifdef XCP + if (OidIsValid(agg_collect_type)) + argp->paramtype = agg_collect_type; + else +#endif argp->paramtype = agg_state_type; argp->paramtypmod = -1; argp->paramcollid = agg_input_collation; diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index a55f0d5548..4b4cc2cae6 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -3,6 +3,11 @@ * parse_relation.c * parser support routines dealing with relations * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -30,6 +35,13 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/syscache.h" +#ifdef XCP +#include "utils/guc.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_namespace.h" +#include "pgxc/pgxc.h" +#include "miscadmin.h" +#endif static RangeTblEntry *scanNameSpaceForRefname(ParseState *pstate, @@ -591,6 +603,25 @@ markRTEForSelectPriv(ParseState *pstate, RangeTblEntry *rte, if (rte->rtekind == RTE_RELATION) { +#ifdef XCP + /* + * Ugly workaround against permission check error when non-privileged + * user executes ANALYZE command. + * To update local statistics coordinator queries pg_statistic tables on + * datanodes, but these are not selectable by PUBLIC. 
It would be better + * to define view, but pg_statistic contains fields of anyarray pseudotype + * which is not allowed in view. + * So we just disable check for SELECT permission if query referring the + * pg_statistic table is parsed on datanodes. That might be a security hole, + * but fortunately any user query against pg_statistic would be parsed on + * coordinator, and permission check would take place; the only way to + * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only + * available for superuser. + */ + if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId) + rte->requiredPerms = 0; + else +#endif /* Make sure the rel as a whole is marked for SELECT access */ rte->requiredPerms |= ACL_SELECT; /* Must offset the attnum to fit in a bitmapset */ @@ -902,11 +933,56 @@ addRangeTableEntry(ParseState *pstate, lockmode = isLockedRefname(pstate, refname) ? RowShareLock : AccessShareLock; rel = parserOpenTable(pstate, relation, lockmode); rte->relid = RelationGetRelid(rel); + +#ifdef XCP + if (IsUnderPostmaster && !superuser() && + get_rel_namespace(rte->relid) == PG_CATALOG_NAMESPACE) + { + Oid relid = InvalidOid; + const char *relname = get_rel_name(rte->relid); + StringInfoData buf; + + /* Check if the relname is in storm_catalog_remap_string */ + initStringInfo(&buf); + appendStringInfoString(&buf, relname); + appendStringInfoChar(&buf, ','); + + elog(DEBUG2, "the constructed name is %s", buf.data); + + /* + * The unqualified relation name should be satisfied from the + * storm_catalog appropriately. Just provide a warning for now if + * it is not.. 
+ */ + if (strstr(storm_catalog_remap_string, buf.data)) + { + relid = RelnameGetRelid((const char *)relname); + + if (get_rel_namespace(relid) != STORM_CATALOG_NAMESPACE) + ereport(WARNING, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("Entry (%s) present in storm_catalog_remap_string " + "but object not picked from STORM_CATALOG",relname))); + else + { + + /* close the existing relation and open the new one */ + heap_close(rel, NoLock); + + rel = relation_open(relid, NoLock); + rte->relid = RelationGetRelid(rel); + } + } + } +#endif + rte->relkind = rel->rd_rel->relkind; #ifdef PGXC +#ifndef XCP rte->relname = RelationGetRelationName(rel); #endif +#endif /* * Build the list of effective column names using user-supplied aliases @@ -935,6 +1011,25 @@ addRangeTableEntry(ParseState *pstate, rte->inh = inh; rte->inFromCl = inFromCl; +#ifdef XCP + /* + * Ugly workaround against permission check error when non-privileged + * user executes ANALYZE command. + * To update local statistics coordinator queries pg_statistic tables on + * datanodes, but these are not selectable by PUBLIC. It would be better + * to define view, but pg_statistic contains fields of anyarray pseudotype + * which is not allowed in view. + * So we just disable check for SELECT permission if query referring the + * pg_statistic table is parsed on datanodes. That might be a security hole, + * but fortunately any user query against pg_statistic would be parsed on + * coordinator, and permission check would take place; the only way to + * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only + * available for superuser. 
+ */ + if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId) + rte->requiredPerms = 0; + else +#endif rte->requiredPerms = ACL_SELECT; rte->checkAsUser = InvalidOid; /* not set-uid by default, either */ rte->selectedCols = NULL; @@ -972,8 +1067,10 @@ addRangeTableEntryForRelation(ParseState *pstate, rte->relkind = rel->rd_rel->relkind; #ifdef PGXC +#ifndef XCP rte->relname = RelationGetRelationName(rel); #endif +#endif /* * Build the list of effective column names using user-supplied aliases @@ -1421,6 +1518,15 @@ addRangeTableEntryForCTE(ParseState *pstate, errmsg("WITH query \"%s\" does not have a RETURNING clause", cte->ctename), parser_errposition(pstate, rv->location))); + +#ifdef PGXC +#ifndef XCP + if (ctequery->returningList != NIL) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("RETURNING clause not yet supported")))); +#endif +#endif } rte->ctecoltypes = cte->ctecoltypes; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index b8ebf9b52d..ec00730eec 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -16,6 +16,11 @@ * a quick copyObject() call before manipulating the query tree. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -37,6 +42,9 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_operator.h" #include "catalog/pg_type.h" +#ifdef XCP +#include "catalog/pgxc_node.h" +#endif #include "commands/comment.h" #include "commands/defrem.h" #include "commands/tablecmds.h" @@ -53,10 +61,9 @@ #include "parser/parse_type.h" #include "parser/parse_utilcmd.h" #ifdef PGXC -#include "optimizer/pgxcship.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "pgxc/execRemote.h" #endif #include "parser/parser.h" @@ -68,6 +75,19 @@ #include "utils/syscache.h" #include "utils/typcache.h" +#ifdef XCP +/* + * Sources to make decision about distribution column, in order of preceedence + */ +typedef enum +{ + FBS_NONE, /* no fallback columns */ + FBS_COLDEF, /* column definition, if no constraints defined */ + FBS_UIDX, /* unique key definition, if no PK defined */ + FBS_PKEY, /* primary key definition */ + FBS_REPLICATE /* constraint definitions require to replicate table */ +} FallbackSrc; +#endif /* State shared by transformCreateStmt and its subroutines */ typedef struct @@ -90,7 +110,12 @@ typedef struct * the table */ IndexStmt *pkey; /* PRIMARY KEY index, if any */ #ifdef PGXC - char *fallback_dist_col; /* suggested column to distribute on */ +#ifdef XCP + FallbackSrc fallback_source; + List *fallback_dist_cols; +#else + char *fallback_dist_col; /* suggested column to distribute on */ +#endif DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */ PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */ #endif @@ -110,6 +135,9 @@ typedef struct List *grants; /* GRANT items */ } CreateSchemaStmtContext; +#ifdef XCP +bool loose_constraints = false; +#endif static void 
transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column); @@ -134,6 +162,13 @@ static void transformConstraintAttrs(CreateStmtContext *cxt, List *constraintList); static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column); static void setSchemaName(char *context_schema, char **stmt_schema_name); +#ifdef PGXC +static void checkLocalFKConstraints(CreateStmtContext *cxt); +#endif +#ifdef XCP +static List *transformSubclusterNodes(PGXCSubCluster *subcluster); +static PGXCSubCluster *makeSubCluster(List *nodelist); +#endif /* * transformCreateStmt - @@ -149,8 +184,14 @@ static void setSchemaName(char *context_schema, char **stmt_schema_name); * then expand those into multiple IndexStmt blocks. * - thomas 1997-12-02 */ +#ifdef XCP +List * +transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute) +#else List * transformCreateStmt(CreateStmt *stmt, const char *queryString) +#endif { ParseState *pstate; CreateStmtContext cxt; @@ -223,8 +264,14 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) cxt.pkey = NULL; cxt.hasoids = interpretOidsOption(stmt->options); #ifdef PGXC +#ifdef XCP + cxt.fallback_source = FBS_NONE; + cxt.fallback_dist_cols = NIL; +#else cxt.fallback_dist_col = NULL; +#endif cxt.distributeby = stmt->distributeby; + cxt.subcluster = stmt->subcluster; #endif Assert(!stmt->ofTypename || !stmt->inhRelations); /* grammar enforces */ @@ -295,6 +342,90 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) * If the user did not specify any distribution clause and there is no * inherits clause, try and use PK or unique index */ +#ifdef XCP + if (IS_PGXC_COORDINATOR && autodistribute && !stmt->distributeby) + { + /* always apply suggested subcluster */ + stmt->subcluster = copyObject(cxt.subcluster); + if (cxt.distributeby) + { + stmt->distributeby = copyObject(cxt.distributeby); + return result; + } + /* + * If constraints require replicated table set it replicated + */ + 
stmt->distributeby = makeNode(DistributeBy); + if (cxt.fallback_source == FBS_REPLICATE) + { + stmt->distributeby->disttype = DISTTYPE_REPLICATION; + stmt->distributeby->colname = NULL; + } + /* + * If there are parent tables ingerit distribution of the first parent + */ + else if (cxt.fallback_source < FBS_UIDX && stmt->inhRelations) + { + RangeVar *inh = (RangeVar *) linitial(stmt->inhRelations); + Relation rel; + + Assert(IsA(inh, RangeVar)); + rel = heap_openrv(inh, AccessShareLock); + if (rel->rd_rel->relkind != RELKIND_RELATION) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("inherited relation \"%s\" is not a table", + inh->relname))); + + if (rel->rd_locator_info) + { + switch (rel->rd_locator_info->locatorType) + { + case LOCATOR_TYPE_HASH: + stmt->distributeby->disttype = DISTTYPE_HASH; + stmt->distributeby->colname = + pstrdup(rel->rd_locator_info->partAttrName); + break; + case LOCATOR_TYPE_MODULO: + stmt->distributeby->disttype = DISTTYPE_MODULO; + stmt->distributeby->colname = + pstrdup(rel->rd_locator_info->partAttrName); + break; + case LOCATOR_TYPE_REPLICATED: + stmt->distributeby->disttype = DISTTYPE_REPLICATION; + break; + case LOCATOR_TYPE_RROBIN: + default: + stmt->distributeby->disttype = DISTTYPE_ROUNDROBIN; + break; + } + /* + * Use defined node, if nothing defined get from the parent + */ + if (stmt->subcluster == NULL) + stmt->subcluster = makeSubCluster(rel->rd_locator_info->nodeList); + } + heap_close(rel, NoLock); + } + /* + * If there are columns suitable for hash distribution distribute on + * first of them. 
+ */ + else if (cxt.fallback_dist_cols) + { + stmt->distributeby->disttype = DISTTYPE_HASH; + stmt->distributeby->colname = (char *) linitial(cxt.fallback_dist_cols); + } + /* + * If none of above applies distribute by round robin + */ + else + { + stmt->distributeby->disttype = DISTTYPE_ROUNDROBIN; + stmt->distributeby->colname = NULL; + } + } +#else if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col) { stmt->distributeby = (DistributeBy *) palloc0(sizeof(DistributeBy)); @@ -302,6 +433,7 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) stmt->distributeby->colname = cxt.fallback_dist_col; } #endif +#endif return result; } @@ -689,6 +821,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla cancel_parser_errposition_callback(&pcbstate); #ifdef PGXC +#ifndef XCP /* * Check if relation is temporary and assign correct flag. * This will override transaction direct commit as no 2PC @@ -696,6 +829,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla */ if (IsTempTable(RelationGetRelid(relation))) ExecSetTempObjectIncluded(); +#endif /* * Block the creation of tables using views in their LIKE clause. 
@@ -710,7 +844,11 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla if (relation->rd_rel->relkind == RELKIND_VIEW) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support VIEW in LIKE clauses"), +#else errmsg("Postgres-XC does not support VIEW in LIKE clauses"), +#endif errdetail("The feature is not currently supported"))); #endif @@ -779,6 +917,21 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla */ cxt->columns = lappend(cxt->columns, def); +#ifdef XCP + /* + * If the distribution is not defined yet by a priority source add it + * to the list of possible fallbacks + */ + if (IS_PGXC_COORDINATOR && cxt->distributeby == NULL && !cxt->isalter && + cxt->fallback_source <= FBS_COLDEF && + IsTypeHashDistributable(attribute->atttypid)) + { + cxt->fallback_dist_cols = lappend(cxt->fallback_dist_cols, + pstrdup(attributeName)); + cxt->fallback_source = FBS_COLDEF; + } +#endif + /* * Copy default, if present and the default has been requested */ @@ -1473,6 +1626,12 @@ static IndexStmt * transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) { IndexStmt *index; +#ifdef PGXC + bool isLocalSafe = false; +#endif +#ifdef XCP + List *fallback_cols = NIL; +#endif ListCell *lc; index = makeNode(IndexStmt); @@ -1735,6 +1894,24 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) if (strcmp(column->colname, key) == 0) { found = true; + +#ifdef PGXC +#ifndef XCP + /* + * Only allow locally enforceable constraints. + * See if it is a distribution column + * If not set, set it to first column in index. + * If primary key, we prefer that over a unique constraint. 
+ */ + if (IS_PGXC_COORDINATOR && !isLocalSafe) + { + if (cxt->distributeby) + isLocalSafe = CheckLocalIndexColumn ( + ConvertToLocatorType(cxt->distributeby->disttype), + cxt->distributeby->colname, key); + } +#endif +#endif break; } } @@ -1781,6 +1958,25 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) if (strcmp(key, inhname) == 0) { found = true; +#ifdef XCP + /* + * We should add the column to the fallback list now, + * so it could be found there, because inherited + * columns are not normally added. + * Do not modify the list if it is set from a priority + * source. + */ + if (IS_PGXC_COORDINATOR && + cxt->distributeby == NULL && !cxt->isalter && + cxt->fallback_source <= FBS_COLDEF && + IsTypeHashDistributable(inhattr->atttypid)) + { + cxt->fallback_dist_cols = + lappend(cxt->fallback_dist_cols, + pstrdup(inhname)); + cxt->fallback_source = FBS_COLDEF; + } +#endif /* * We currently have no easy way to force an inherited @@ -1833,14 +2029,64 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) #ifdef PGXC if (IS_PGXC_COORDINATOR) { +#ifdef XCP + /* + * Check if index can be enforced locally + */ + if (!isLocalSafe) + { + ListCell *lc; + /* + * If distribution is defined check current column against + * the distribution. 
+ */ + if (cxt->distributeby) + isLocalSafe = CheckLocalIndexColumn ( + ConvertToLocatorType(cxt->distributeby->disttype), + cxt->distributeby->colname, key); + /* + * Similar, if altering existing table check against target + * table distribution + */ + if (cxt->isalter) + isLocalSafe = cxt->rel->rd_locator_info == NULL || + CheckLocalIndexColumn ( + cxt->rel->rd_locator_info->locatorType, + cxt->rel->rd_locator_info->partAttrName, + key); + + /* + * Check if it is possible to distribute table by this column + * If yes, save it, and replace the fallback list when done + */ + foreach (lc, cxt->fallback_dist_cols) + { + char *col = (char *) lfirst(lc); + + if (strcmp(key, col) == 0) + { + fallback_cols = lappend(fallback_cols, pstrdup(key)); + break; + } + } + } +#else /* * Set fallback distribution column. * If not set, set it to first column in index. * If primary key, we prefer that over a unique constraint. */ - if (index->indexParams == NIL && - (index->primary || !cxt->fallback_dist_col)) + if (index->indexParams == NIL + && (index->primary || !cxt->fallback_dist_col)) + { cxt->fallback_dist_col = pstrdup(key); + } + + /* Existing table, check if it is safe */ + if (cxt->isalter && !cxt->distributeby && !isLocalSafe) + isLocalSafe = CheckLocalIndexColumn ( + cxt->rel->rd_locator_info->locatorType, cxt->rel->rd_locator_info->partAttrName, key); +#endif } #endif @@ -1855,6 +2101,61 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) iparam->nulls_ordering = SORTBY_NULLS_DEFAULT; index->indexParams = lappend(index->indexParams, iparam); } +#ifdef PGXC +#ifdef XCP + if (IS_PGXC_COORDINATOR && !isLocalSafe) + { + if (cxt->distributeby || cxt->isalter) + { + /* + * Index is not safe for defined distribution; since for replicated + * distribution any index is safe and for round robin none, but + * this case bombs out immediately, so that is incompatible + * HASH or MODULO. Report the problem. 
+ */ + if (loose_constraints && cxt->isalter && index->unique) + ereport(WARNING, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the" + " hash distribution column."))); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the" + " hash distribution column."))); + } + else + { + if (fallback_cols) + { + list_free_deep(cxt->fallback_dist_cols); + cxt->fallback_dist_cols = fallback_cols; + if (index->primary) + cxt->fallback_source = FBS_PKEY; + else if (cxt->fallback_source < FBS_PKEY) + cxt->fallback_source = FBS_UIDX; + } + else + { + if (cxt->fallback_dist_cols) + { + list_free_deep(cxt->fallback_dist_cols); + cxt->fallback_dist_cols = NIL; + } + cxt->fallback_source = FBS_REPLICATE; + } + } + } +#else + if (IS_PGXC_COORDINATOR && cxt->distributeby + && (cxt->distributeby->disttype == DISTTYPE_HASH || + cxt->distributeby->disttype == DISTTYPE_MODULO) + && !isLocalSafe) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Unique index of partitioned table must contain the hash distribution column."))); +#endif +#endif return index; } @@ -1872,6 +2173,25 @@ transformFKConstraints(CreateStmtContext *cxt, if (cxt->fkconstraints == NIL) return; +#ifdef XCP + /* + * If the "loose_constraints" GUC is set, we wholesale avoid creating + * Foreign Keys. Another way is to identify only those unenforceable + * FK constraints and skip over those. However the query string sent to + * the datanodes still contains those FKs and messes up things later. + * This can be handled by re-generating the query string that should be + * passed onto the datanodes, but that's quite a lot of work. + * + * Also supporting some FKs and not some others is also debatable.. 
+ * So we go in for an all-or-nothing approach here + */ + if (loose_constraints) + { + list_free_deep(cxt->fkconstraints); + cxt->fkconstraints = NIL; + return; + } +#endif /* * If CREATE TABLE or adding a column with NULL default, we can safely * skip validation of FK constraints, and nonetheless mark them valid. @@ -1886,21 +2206,19 @@ transformFKConstraints(CreateStmtContext *cxt, constraint->skip_validation = true; constraint->initially_valid = true; #ifdef PGXC +#ifndef XCP /* * Set fallback distribution column. * If not yet set, set it to first column in FK constraint * if it references a partitioned table */ - if (IS_PGXC_COORDINATOR && - !cxt->fallback_dist_col && - list_length(constraint->pk_attrs) != 0) + if (IS_PGXC_COORDINATOR && !cxt->fallback_dist_col) { Oid pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false); - AttrNumber attnum = get_attnum(pk_rel_id, - strVal(list_nth(constraint->fk_attrs, 0))); - /* Make sure key is done on a partitioned column */ - if (IsDistribColumn(pk_rel_id, attnum)) + /* make sure it is a partitioned column */ + if (list_length(constraint->pk_attrs) != 0 + && IsHashColumnForRelId(pk_rel_id, strVal(list_nth(constraint->pk_attrs,0)))) { /* take first column */ char *colstr = strdup(strVal(list_nth(constraint->fk_attrs,0))); @@ -1908,9 +2226,16 @@ transformFKConstraints(CreateStmtContext *cxt, } } #endif +#endif } } +#ifdef PGXC + /* Only allow constraints that are locally enforceable - no distributed ones */ + if (IS_PGXC_COORDINATOR) + checkLocalFKConstraints(cxt); +#endif + /* * For CREATE TABLE or ALTER TABLE ADD COLUMN, gin up an ALTER TABLE ADD * CONSTRAINT command to execute after the basic command is complete. 
(If @@ -2434,7 +2759,12 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString) cxt.alist = NIL; cxt.pkey = NULL; #ifdef PGXC +#ifdef XCP + cxt.fallback_source = FBS_NONE; + cxt.fallback_dist_cols = NIL; +#else cxt.fallback_dist_col = NULL; +#endif cxt.distributeby = NULL; cxt.subcluster = NULL; #endif @@ -2721,6 +3051,20 @@ transformColumnType(CreateStmtContext *cxt, ColumnDef *column) parser_errposition(cxt->pstate, column->collClause->location))); } +#ifdef XCP + /* + * If the distribution is not defined yet by a priority source add it to the + * list of possible fallbacks + */ + if (IS_PGXC_COORDINATOR && cxt->distributeby == NULL && !cxt->isalter && + cxt->fallback_source <= FBS_COLDEF && + IsTypeHashDistributable(HeapTupleGetOid(ctype))) + { + cxt->fallback_dist_cols = lappend(cxt->fallback_dist_cols, + pstrdup(column->colname)); + cxt->fallback_source = FBS_COLDEF; + } +#endif ReleaseSysCache(ctype); } @@ -2866,3 +3210,600 @@ setSchemaName(char *context_schema, char **stmt_schema_name) "different from the one being created (%s)", *stmt_schema_name, context_schema))); } + +#ifdef PGXC +/* + * CheckLocalIndexColumn + * + * Checks whether or not the index can be safely enforced locally + */ +bool +CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname) +{ +#ifdef XCP + if (IsLocatorReplicated(loctype)) +#else + if (loctype == LOCATOR_TYPE_REPLICATED) +#endif + /* always safe */ + return true; + if (loctype == LOCATOR_TYPE_RROBIN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("Cannot locally enforce a unique index on round robin distributed table."))); + else if (loctype == LOCATOR_TYPE_HASH || loctype == LOCATOR_TYPE_MODULO) + { + if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0) + return true; + } + return false; +} + + +#ifdef XCP +/* + * Given relation, find the index of the attribute in the primary key, + * which is the distribution key. 
Returns -1 if table is not a Hash/Modulo + * distributed, does not have a primary key or distribution key is not in the + * primary key (last should not happen). + */ +static int +find_relation_pk_dist_index(Relation rel) +{ + int result = -1; + List *indexoidlist; + ListCell *indexoidscan; + int partAttNum = InvalidAttrNumber; + bool pk_found = false; + + if (rel->rd_locator_info) + partAttNum = rel->rd_locator_info->partAttrNum; + + if (partAttNum == InvalidAttrNumber) + return -1; + + /* + * Look up the primary key + */ + indexoidlist = RelationGetIndexList(rel); + + foreach(indexoidscan, indexoidlist) + { + Oid indexoid = lfirst_oid(indexoidscan); + HeapTuple indexTuple; + Form_pg_index indexForm; + + indexTuple = SearchSysCache1(INDEXRELID, + ObjectIdGetDatum(indexoid)); + if (!HeapTupleIsValid(indexTuple)) /* should not happen */ + elog(ERROR, "cache lookup failed for index %u", indexoid); + indexForm = ((Form_pg_index) GETSTRUCT(indexTuple)); + if (indexForm->indisprimary) + { + int i; + + pk_found = true; + + /* + * Loop over index attributes to find + * the distribution key + */ + for (i = 0; i < indexForm->indnatts; i++) + { + if (indexForm->indkey.values[i] == partAttNum) + { + result = i; + break; + } + } + } + ReleaseSysCache(indexTuple); + if (pk_found) + break; + } + + list_free(indexoidlist); + + return result; +} +#endif + + +/* + * check to see if the constraint can be enforced locally + * if not, an error will be thrown + */ +static void +checkLocalFKConstraints(CreateStmtContext *cxt) +{ + ListCell *fkclist; +#ifdef XCP + List *nodelist = NIL; + + if (cxt->subcluster) + nodelist = transformSubclusterNodes(cxt->subcluster); +#endif + foreach(fkclist, cxt->fkconstraints) + { + Constraint *constraint; + Oid pk_rel_id; +#ifdef XCP + RelationLocInfo *rel_loc_info; +#else + char refloctype; + char *checkcolname = NULL; +#endif + constraint = (Constraint *) lfirst(fkclist); + + /* + * If constraint references to the table itself, it is safe + * Check 
if relation name is the same + * XCTODO: NO! It is only safe if table is replicated + * or distributed on primary key + */ + if (constraint->pktable && + strcmp(constraint->pktable->relname,cxt->relation->relname) == 0) + { + /* Is namespace also the same ? */ + char *fkcon_schemaname = NULL; + + if (!cxt->relation->schemaname && + !constraint->pktable->schemaname) + continue; + + if (!constraint->pktable->schemaname) + { + /* Schema name is not defined, look for current one */ + List *search_path = fetch_search_path(false); + fkcon_schemaname = get_namespace_name(linitial_oid(search_path)); + list_free(search_path); + } + else + fkcon_schemaname = constraint->pktable->schemaname; + + /* + * If schema name and relation name are the same, table + * references to itself, so constraint is safe + */ + if (fkcon_schemaname && + strcmp(fkcon_schemaname, + cxt->relation->schemaname) == 0) +#ifdef XCP + { + /* check if bad distribution is already defined */ + if ((cxt->distributeby && cxt->distributeby->disttype != DISTTYPE_REPLICATION) || + (cxt->isalter && cxt->rel->rd_locator_info != NULL && !IsLocatorReplicated(cxt->rel->rd_locator_info->locatorType))) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("only replicated table can reference itself"))); + /* Record that replication is required */ + cxt->fallback_source = FBS_REPLICATE; + if (cxt->fallback_dist_cols) + { + list_free_deep(cxt->fallback_dist_cols); + cxt->fallback_dist_cols = NULL; + } + continue; + } +#else + continue; +#endif + } + + pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false); +#ifdef XCP + rel_loc_info = GetRelationLocInfo(pk_rel_id); + /* If referenced table is replicated, the constraint is safe */ + if (rel_loc_info == NULL || IsLocatorReplicated(rel_loc_info->locatorType)) + { + List *common; + + if (cxt->subcluster) + { + /* + * Distribution nodes are defined, they must be a subset of + * the referenced relation's nodes + */ + common = list_intersection_int(nodelist, 
rel_loc_info->nodeList); + if (list_length(common) < list_length(nodelist)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("referenced table is not defined on all target nodes"))); + list_free(common); + } + else + { + /* suggest distribution */ + if (nodelist) + { + common = list_intersection_int(nodelist, rel_loc_info->nodeList); + if (list_length(common) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("referenced tables is defined on different nodes"))); + list_free(nodelist); + nodelist = common; + } + else + nodelist = rel_loc_info? list_copy(rel_loc_info->nodeList):NIL; + } + } + else if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot reference a round robin table in a foreign key constraint"))); + } + else if (IsLocatorDistributedByValue(rel_loc_info->locatorType)) + { + ListCell *fklc; + ListCell *pklc; + char ltype; + char *lattr; + bool found = false; + List *common; + + /* + * First check nodes, they must be the same as in + * the referenced relation + */ + if (cxt->subcluster) + { + common = list_intersection_int(nodelist, rel_loc_info->nodeList); + if (list_length(common) != list_length(rel_loc_info->nodeList) || + list_length(common) != list_length(nodelist)) + { + if (list_length(common) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("referenced HASH/MODULO table must be defined on same nodes"))); + } + list_free(common); + } + else + { + if (nodelist) + { + common = list_intersection_int(nodelist, rel_loc_info->nodeList); + if (list_length(common) != list_length(rel_loc_info->nodeList)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("referenced HASH/MODULO table must be defined on same nodes"))); + list_free(nodelist); + nodelist = common; + } + else + nodelist = list_copy(rel_loc_info->nodeList); + /* Now define the subcluster */ + cxt->subcluster = makeSubCluster(nodelist); + } + + if (cxt->distributeby) + { + 
ltype = ConvertToLocatorType(cxt->distributeby->disttype); + lattr = cxt->distributeby->colname; + } + else if (cxt->isalter) + { + if (cxt->rel->rd_locator_info == NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + ltype = cxt->rel->rd_locator_info->locatorType; + lattr = cxt->rel->rd_locator_info->partAttrName; + } + else + { + /* + * Not defined distribution, but we can define now. + * The distribution must be the same as in referenced table, + * distribution keys must be matching fk/pk + */ + /* + * Can not define distribution by value already + */ + if (cxt->fallback_source == FBS_REPLICATE) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + /* find the fk attribute matching the distribution column */ + lattr = NULL; + if (list_length(constraint->pk_attrs) == 0) + { + /* + * PK attribute list may be missing, so FK must reference + * the primary table's primary key. The primary key may + * consist of multiple attributes, one of them is a + * distribution key. We should find the foreign attribute + * referencing that primary attribute and set it as the + * distribution key of the table. + */ + int pk_attr_idx; + Relation rel; + + rel = relation_open(pk_rel_id, AccessShareLock); + pk_attr_idx = find_relation_pk_dist_index(rel); + relation_close(rel, AccessShareLock); + + if (pk_attr_idx >= 0 && + pk_attr_idx < list_length(constraint->fk_attrs)) + { + lattr = strVal(list_nth(constraint->fk_attrs, pk_attr_idx)); + } + } + else + { + /* + * One of the primary attributes must be the primary + * tabble's distribution key. We should find the foreign + * attribute referencing that primary attribute and set it + * as the distribution key of the table. 
+ */ + forboth(fklc, constraint->fk_attrs, + pklc, constraint->pk_attrs) + { + if (strcmp(rel_loc_info->partAttrName, + strVal(lfirst(pklc))) == 0) + { + lattr = strVal(lfirst(fklc)); + break; + } + } + } + /* distribution column is not referenced? */ + if (lattr == NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + foreach(fklc, cxt->fallback_dist_cols) + { + if (strcmp(lattr, (char *) lfirst(fklc)) == 0) + { + found = true; + break; + } + } + if (found) + { + list_free_deep(cxt->fallback_dist_cols); + cxt->fallback_dist_cols = NIL; + cxt->fallback_source = FBS_NONE; + cxt->distributeby = makeNode(DistributeBy); + switch (rel_loc_info->locatorType) + { + case LOCATOR_TYPE_HASH: + cxt->distributeby->disttype = DISTTYPE_HASH; + cxt->distributeby->colname = pstrdup(lattr); + break; + case LOCATOR_TYPE_MODULO: + cxt->distributeby->disttype = DISTTYPE_MODULO; + cxt->distributeby->colname = pstrdup(lattr); + break; + default: + /* can not happen ?*/ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + } + } + else /* dist attr is not found */ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + continue; + } + /* + * Here determine if already defined distribution is matching + * to distribution of primary table. + */ + if (ltype != rel_loc_info->locatorType || lattr == NULL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + if (list_length(constraint->pk_attrs) == 0) + { + /* + * PK attribute list may be missing, so FK must reference + * the primary table's primary key. 
The primary key may + * consist of multiple attributes, one of them is a + * distribution key. We should find the foreign attribute + * referencing that primary attribute and make sure it is a + * distribution key of the table. + */ + int pk_attr_idx; + Relation rel; + + rel = relation_open(pk_rel_id, AccessShareLock); + pk_attr_idx = find_relation_pk_dist_index(rel); + relation_close(rel, AccessShareLock); + + /* + * Two first conditions are just avoid assertion failure in + * list_nth. First should never happen, because the primary key + * of hash/modulo distributed table must contain distribution + * key. Second may only happen if list of foreign columns is + * shorter then the primary key. In that case statement would + * probably fail later, but no harm if it fails here. + */ + if (pk_attr_idx >= 0 && + pk_attr_idx < list_length(constraint->fk_attrs) && + strcmp(lattr, strVal(list_nth(constraint->fk_attrs, + pk_attr_idx))) == 0) + { + found = true; + } + } + else + { + forboth(fklc, constraint->fk_attrs, pklc, constraint->pk_attrs) + { + if (strcmp(lattr, strVal(lfirst(fklc))) == 0) + { + found = true; + if (strcmp(rel_loc_info->partAttrName, + strVal(lfirst(pklc))) == 0) + break; + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + } + } + } + if (!found) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer" + " to hash/modulo distribution column in referenced table."))); + } + else /* Unsupported distribution */ + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot reference a table with distribution type \"%c\"", + rel_loc_info->locatorType))); + } +#else + refloctype = GetLocatorType(pk_rel_id); + /* If referenced table is replicated, the constraint is safe */ + if (refloctype == LOCATOR_TYPE_REPLICATED) + continue; + else if (refloctype == 
LOCATOR_TYPE_RROBIN) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot reference a round robin table in a foreign key constraint"))); + } + /* + * See if we are hash or modulo partitioned and the column appears in the + * constraint, and it corresponds to the position in the referenced table. + */ + if (cxt->isalter) + { + if (cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_HASH || + cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_MODULO) + { + checkcolname = cxt->rel->rd_locator_info->partAttrName; + } + } + else + { + if (cxt->distributeby) + { + if (cxt->distributeby->disttype == DISTTYPE_HASH || + cxt->distributeby->disttype == DISTTYPE_MODULO) + checkcolname = cxt->distributeby->colname; + } + else + { + if (cxt->fallback_dist_col) + checkcolname = cxt->fallback_dist_col; + } + } + if (checkcolname) + { + int pos = 0; + + ListCell *attritem; + + foreach(attritem, constraint->fk_attrs) + { + char *attrname = (char *) strVal(lfirst(attritem)); + + if (strcmp(checkcolname, attrname) == 0) + { + /* Found the ordinal position in constraint */ + break; + } + pos++; + } + + if (pos >= list_length(constraint->fk_attrs)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distributed table must include distribution column in index"))); + + /* + * The check to make sure that the referenced column in pk table is the same + * as the one used to distribute it makes sense only when the user + * supplies the name of the referenced colum while adding the constraint + * because if the user did not specify it the system will choose the pk column + * which will obviously be the one used to distribute it knowing the + * existing constraints in XC + * This is required to make sure that both + * alter table dtab add foreign key (b) references rtab(a); + * and + * alter table dtab add foreign key (b) references rtab; + * behave similarly + */ + if (constraint->pk_attrs != NULL) + { + /* Verify that the referenced table is 
partitioned at the same position in the index */ + if (!IsDistColumnForRelId(pk_rel_id, strVal(list_nth(constraint->pk_attrs,pos)))) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Hash/Modulo distribution column does not refer to hash/modulo distribution column in referenced table."))); + } + } +#endif + } +#ifdef XCP + /* + * If presence of a foreign constraint suggested a set of nodes, fix it here + */ + if (nodelist && cxt->subcluster == NULL) + cxt->subcluster = makeSubCluster(nodelist); +#endif +} +#endif + + +#ifdef XCP +/* + * Convert SubCluster definition to a list of Datanode indexes, to compare to + * relation nodes + */ +static List * +transformSubclusterNodes(PGXCSubCluster *subcluster) +{ + List *result = NIL; + Oid *nodeoids; + int numnodes; + int i; + char nodetype = PGXC_NODE_DATANODE; + + nodeoids = GetRelationDistributionNodes(subcluster, &numnodes); + for (i = 0; i < numnodes; i++) + result = lappend_int(result, PGXCNodeGetNodeId(nodeoids[i], &nodetype)); + + return result; +} + + +/* + * Create a SubCluster definition from a list of node indexes. + */ +static PGXCSubCluster * +makeSubCluster(List *nodelist) +{ + PGXCSubCluster *result; + ListCell *lc; + result = makeNode(PGXCSubCluster); + result->clustertype = SUBCLUSTER_NODE; + foreach (lc, nodelist) + { + int nodeidx = lfirst_int(lc); + char *nodename = get_pgxc_nodename( + PGXCNodeGetNodeOid(nodeidx, PGXC_NODE_DATANODE)); + result->members = lappend(result->members, makeString(nodename)); + } + return result; +} +#endif diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile index 1fe1c12d02..9786b3b16a 100644 --- a/src/backend/pgxc/Makefile +++ b/src/backend/pgxc/Makefile @@ -11,6 +11,6 @@ subdir = src/backend/pgxc top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = locator pool barrier nodemgr copy xc_maintenance_mode +SUBDIRS = locator plan pool barrier nodemgr squeue cluster copy xc_maintenance_mode include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/cluster/Makefile b/src/backend/pgxc/cluster/Makefile new file mode 100644 index 0000000000..85c1d493f0 --- /dev/null +++ b/src/backend/pgxc/cluster/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile -- +# Makefile for cluster functionality +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/cluster +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = pause.o stormutils.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/cluster/pause.c b/src/backend/pgxc/cluster/pause.c new file mode 100644 index 0000000000..ecac0950b0 --- /dev/null +++ b/src/backend/pgxc/cluster/pause.c @@ -0,0 +1,480 @@ +/*------------------------------------------------------------------------- + * + * pause.c + * + * Cluster Pause/Unpause handling + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifdef XCP +#include "postgres.h" +#include "pgxc/execRemote.h" +#include "pgxc/pause.h" +#include "pgxc/pgxc.h" +#include "storage/spin.h" +#include "miscadmin.h" + +/* globals */ +bool cluster_lock_held; +bool cluster_ex_lock_held; + +static void HandleClusterPause(bool pause, bool initiator); +static void ProcessClusterPauseRequest(bool pause); + +ClusterLockInfo *ClustLinfo = NULL; + +/* + * ProcessClusterPauseRequest: + * + * Carry out PAUSE/UNPAUSE request on a coordinator node + */ +static void +ProcessClusterPauseRequest(bool pause) +{ + char *action = pause? 
"PAUSE":"UNPAUSE"; + + if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The %s CLUSTER message is expected to " + "arrive at a coordinator from another coordinator", + action))); + + elog(DEBUG2, "Received %s CLUSTER from a coordinator", action); + + /* + * If calling UNPAUSE, ensure that the cluster lock has already been held + * in exclusive mode + */ + if (!pause && !cluster_ex_lock_held) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Received an UNPAUSE request when cluster not PAUSED!"))); + + /* + * Enable/Disable local queries. We need to release the lock first + * + * TODO: Think of some timeout mechanism here, if the locking takes too + * much time... + */ + ReleaseClusterLock(pause? false:true); + AcquireClusterLock(pause? true:false); + + if (pause) + cluster_ex_lock_held = true; + else + cluster_ex_lock_held = false; + + elog(DEBUG2, "%s queries at the coordinator", pause? "Paused":"Resumed"); + + return; +} + +/* + * HandleClusterPause: + * + * Any errors will be reported via ereport. + */ +static void +HandleClusterPause(bool pause, bool initiator) +{ + PGXCNodeAllHandles *coord_handles; + int conn; + int response; + char *action = pause? "PAUSE":"UNPAUSE"; + + elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action); + + if (pause && cluster_ex_lock_held) + { + ereport(NOTICE, (errmsg("CLUSTER already PAUSED"))); + + /* Nothing to do */ + return; + } + + if (!pause && !cluster_ex_lock_held) + { + ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE"))); + + /* Nothing to do */ + return; + } + + /* + * If we are one of the participating coordinators, just do the action + * locally and return + */ + if (!initiator) + { + ProcessClusterPauseRequest(pause); + return; + } + + /* + * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. 
We should send an + * asyncronous request, update the local ClusterLock and then wait for the remote + * coordinators to respond back + */ + + coord_handles = get_handles(NIL, GetAllCoordNodes(), true); + + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + if (pgxc_node_send_query(handle, pause? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send %s CLUSTER request to some coordinator nodes",action))); + } + + /* + * Disable/Enable local queries. We need to release the SHARED mode first + * + * TODO: Start a timer to cancel the request in case of a timeout + */ + ReleaseClusterLock(pause? false:true); + AcquireClusterLock(pause? true:false); + + if (pause) + cluster_ex_lock_held = true; + else + cluster_ex_lock_held = false; + + + elog(DEBUG2, "%s queries at the driving coordinator", pause? "Paused":"Resumed"); + + /* + * Local queries are paused/enabled. Check status of the remote coordinators + * now. We need a TRY/CATCH block here, so that if one of the coordinator + * fails for some reason, we can try best-effort to salvage the situation + * at others + * + * We hope that errors in the earlier loop generally do not occur (out of + * memory and improper handles..) or we can have a similar TRY/CATCH block + * there too + * + * To repeat: All the salvaging is best effort really... 
+ */ + PG_TRY(); + { + ResponseCombiner combiner; + + InitResponseCombiner(&combiner, coord_handles->co_conn_count, COMBINE_TYPE_NONE); + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle; + + handle = coord_handles->coord_handles[conn]; + + while (true) + { + if (pgxc_node_receive(1, &handle, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive a response from the remote coordinator node"))); + + response = handle_response(handle, &combiner); + if (response == RESPONSE_EOF) + continue; + else if (response == RESPONSE_COMPLETE) + break; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s CLUSTER command failed " + "with error %s", action, handle->error))); + } + } + + if (combiner.errorMessage) + { + char *code = combiner.errorCode; + if (combiner.errorDetail != NULL) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner.errorMessage), errdetail("%s", combiner.errorDetail) )); + else + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner.errorMessage))); + } + + CloseCombiner(&combiner); + } + PG_CATCH(); + { + /* + * If PAUSE CLUSTER, issue UNPAUSE on the reachable nodes. For failure + * in cases of UNPAUSE, might need manual intervention at the offending + * coordinator node (maybe do a pg_cancel_backend() on the backend + * that's holding the exclusive lock or something..) + */ + if (!pause) + ereport(WARNING, + (errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes." + " Manual intervention may be required!"))); + else + ereport(WARNING, + (errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes." 
+ " Trying to UNPAUSE reachable nodes now"))); + + for (conn = 0; conn < coord_handles->co_conn_count && pause; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + (void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER"); + + /* + * The incoming data should hopefully be discarded as part of + * cleanup.. + */ + } + + /* cleanup locally.. */ + ReleaseClusterLock(pause? true:false); + AcquireClusterLock(pause? false:true); + cluster_ex_lock_held = false; + PG_RE_THROW(); + } + PG_END_TRY(); + + elog(DEBUG2, "Successfully completed %s CLUSTER command on " + "all coordinator nodes", action); + + return; +} + +void +RequestClusterPause(bool pause, char *completionTag) +{ + char *action = pause? "PAUSE":"UNPAUSE"; + bool initiator = true; + + elog(DEBUG2, "%s CLUSTER request received", action); + + /* Only a superuser can perform this activity on a cluster */ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("%s CLUSTER command: must be a superuser", action))); + + /* Ensure that we are a coordinator */ + if (!IS_PGXC_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s CLUSTER command must be sent to a coordinator", action))); + + /* + * Did the command come directly to this coordinator or via another + * coordinator? + */ + if (IsConnFromCoord()) + initiator = false; + + HandleClusterPause(pause, initiator); + + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action); +} + +/* + * If the backend is shutting down, cleanup the PAUSE cluster lock + * appropriately. 
We do this before shutting down shmem, because this needs + * LWLock and stuff + */ +void +PGXCCleanClusterLock(int code, Datum arg) +{ + PGXCNodeAllHandles *coord_handles; + int conn; + + if (cluster_lock_held && !cluster_ex_lock_held) + { + ReleaseClusterLock (false); + cluster_lock_held = false; + } + + /* Do nothing if cluster lock not held */ + if (!cluster_ex_lock_held) + return; + + /* Do nothing if we are not the initiator */ + if (IsConnFromCoord()) + return; + + coord_handles = get_handles(NIL, GetAllCoordNodes(), true); + /* Try best-effort to UNPAUSE other coordinators now */ + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + /* No error checking here... */ + (void)pgxc_node_send_query(handle, "UNPAUSE CLUSTER"); + } + + /* Release locally too. We do not want a dangling value in cl_holder_pid! */ + ReleaseClusterLock(true); + cluster_ex_lock_held = false; +} + +/* Report shared memory space needed by ClusterLockShmemInit */ +Size +ClusterLockShmemSize(void) +{ + Size size = 0; + + size = add_size(size, sizeof(ClusterLockInfo)); + + return size; +} + +/* Allocate and initialize cluster locking related shared memory */ +void +ClusterLockShmemInit(void) +{ + bool found; + + ClustLinfo = (ClusterLockInfo *) + ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found); + + if (!found) + { + /* First time through, so initialize */ + MemSet(ClustLinfo, 0, ClusterLockShmemSize()); + SpinLockInit(&ClustLinfo->cl_mutex); + } +} + +/* + * AcquireClusterLock + * + * Based on the argument passed in, try to update the shared memory + * appropriately. In case the conditions cannot be satisfied immediately this + * function resorts to a simple sleep. 
We don't envision PAUSE CLUSTER to + * occur that frequently so most of the calls will come out immediately here + * without any sleeps at all + * + * We could have used a semaphore to allow the processes to sleep while the + * cluster lock is held. But again we are really not worried about performance + * and immediate wakeups around PAUSE CLUSTER functionality. Using the sleep + * in an infinite loop keeps things simple yet correct + */ +void +AcquireClusterLock(bool exclusive) +{ + volatile ClusterLockInfo *clinfo = ClustLinfo; + + if (exclusive && cluster_ex_lock_held) + { + return; + } + + /* + * In the normal case, none of the backends will ask for exclusive lock, so + * they will just update the cl_process_count value and exit immediately + * from the below loop + */ + for (;;) + { + bool wait = false; + + SpinLockAcquire(&clinfo->cl_mutex); + + if (!exclusive) + { + if (clinfo->cl_holder_pid == 0) + clinfo->cl_process_count++; + else + wait = true; + } + else /* PAUSE CLUSTER handling */ + { + if (clinfo->cl_holder_pid != 0) + { + SpinLockRelease(&clinfo->cl_mutex); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PAUSE CLUSTER already in progress"))); + } + + /* + * There should be no other process + * holding the lock including ourself + */ + if (clinfo->cl_process_count > 0) + wait = true; + else + clinfo->cl_holder_pid = MyProcPid; + } + SpinLockRelease(&clinfo->cl_mutex); + + /* + * We use a simple sleep mechanism. If PAUSE CLUSTER has been invoked, + * we are not worried about immediate performance characteristics.. + */ + if (wait) + { + CHECK_FOR_INTERRUPTS(); + pg_usleep(100000L); + } + else /* Got the proper semantic read/write lock.. */ + break; + } +} + +/* + * ReleaseClusterLock + * + * Update the shared memory appropriately across the release call. 
We + * really do not need the bool argument, but it's there for some + * additional sanity checking + */ +void +ReleaseClusterLock(bool exclusive) +{ + volatile ClusterLockInfo *clinfo = ClustLinfo; + + SpinLockAcquire(&clinfo->cl_mutex); + if (exclusive) + { + if (clinfo->cl_process_count > 1 || + clinfo->cl_holder_pid == 0) + { + SpinLockRelease(&clinfo->cl_mutex); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Inconsistent state while doing UNPAUSE CLUSTER"))); + } + + /* + * Reset the holder pid. Any waiters in AcquireClusterLock will + * eventually come out of their sleep and notice this new value and + * move ahead + */ + clinfo->cl_holder_pid = 0; + } + else + { + if (clinfo->cl_holder_pid != 0) + { + SpinLockRelease(&clinfo->cl_mutex); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Inconsistent state while releasing CLUSTER lock"))); + } + /* + * Decrement our count. If a PAUSE is waiting inside AcquireClusterLock + * elsewhere, it will wake out of sleep and do the needful + */ + if (clinfo->cl_process_count > 0); + clinfo->cl_process_count--; + } + SpinLockRelease(&clinfo->cl_mutex); +} +#endif diff --git a/src/backend/pgxc/cluster/stormutils.c b/src/backend/pgxc/cluster/stormutils.c new file mode 100644 index 0000000000..26b00d4ac5 --- /dev/null +++ b/src/backend/pgxc/cluster/stormutils.c @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------- + * + * stormutils.c + * + * Miscellaneous util functions + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifdef XCP +#include "postgres.h" +#include "miscadmin.h" + +#include "utils/builtins.h" +#include "../interfaces/libpq/libpq-fe.h" +#include "commands/dbcommands.h" + +/* + * stormdb_promote_standby: + * + * Promote a standby into a regular backend by touching the trigger file. 
We + * cannot do it from outside via a normal shell script because this function + * needs to be called in context of the operation that is moving the node. + * Providing a function call provides some sense of transactional atomicity + */ +Datum +stormdb_promote_standby(PG_FUNCTION_ARGS) +{ + char trigger_file[MAXPGPATH]; + FILE *fp; + + snprintf(trigger_file, MAXPGPATH, "%s/stormdb.failover", DataDir); + + if ((fp = fopen(trigger_file, "w")) == NULL) + ereport(ERROR, + (errmsg("could not create trigger file"), + errdetail("The trigger file path was: %s", + trigger_file))); + fclose(fp); + + PG_RETURN_VOID(); +} +#endif diff --git a/src/backend/pgxc/copy/remotecopy.c b/src/backend/pgxc/copy/remotecopy.c index 016ea1425b..2422f25de8 100644 --- a/src/backend/pgxc/copy/remotecopy.c +++ b/src/backend/pgxc/copy/remotecopy.c @@ -3,6 +3,11 @@ * remotecopy.c * Implements an extension of COPY command for remote management * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012, Postgres-XC Development Group * @@ -16,13 +21,16 @@ #include "postgres.h" #include "miscadmin.h" #include "lib/stringinfo.h" -#include "optimizer/pgxcship.h" #include "optimizer/planner.h" #include "pgxc/pgxcnode.h" +#include "pgxc/postgresql_fdw.h" #include "pgxc/remotecopy.h" #include "rewrite/rewriteHandler.h" #include "utils/builtins.h" #include "utils/rel.h" +#ifdef PGXC +#include "utils/lsyscache.h" +#endif static void RemoteCopy_QuoteStr(StringInfo query_buf, char *value); @@ -37,7 +45,11 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state, Relation rel, List *attnums) { +#ifndef XCP + ExecNodes *exec_nodes = makeNode(ExecNodes); +#else ExecNodes *exec_nodes = NULL; +#endif /* * If target table does not exists on nodes (e.g. system table) @@ -46,6 +58,23 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state, */ state->rel_loc = GetRelationLocInfo(RelationGetRelid(rel)); +#ifdef XCP + if (state->rel_loc && + AttributeNumberIsValid(state->rel_loc->partAttrNum)) + { + TupleDesc tdesc; + Form_pg_attribute pattr; + /* determine distribution column data type */ + tdesc = RelationGetDescr(rel); + + pattr = tdesc->attrs[state->rel_loc->partAttrNum - 1]; + state->dist_type = pattr->atttypid; + } + else + state->dist_type = InvalidOid; + + state->locator = NULL; +#else if (state->rel_loc) { /* @@ -55,7 +84,7 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state, */ exec_nodes = makeNode(ExecNodes); if (!state->is_from && - IsRelationReplicated(state->rel_loc)) + IsLocatorReplicated(state->rel_loc->locatorType)) exec_nodes->nodeList = GetPreferredReplicationNode(state->rel_loc->nodeList); else { @@ -96,6 +125,7 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state, /* Then save obtained result */ state->exec_nodes = exec_nodes; +#endif } /* @@ -119,8 +149,18 @@ RemoteCopy_BuildStatement(RemoteCopyData *state, */ initStringInfo(&state->query_buf); 
appendStringInfoString(&state->query_buf, "COPY "); - appendStringInfo(&state->query_buf, "%s", - quote_identifier(RelationGetRelationName(rel))); + + /* + * The table name should be qualified, unless the table is a temporary table + */ + if (rel->rd_backend == MyBackendId) + appendStringInfo(&state->query_buf, "%s", + quote_identifier(RelationGetRelationName(rel))); + else + appendStringInfo(&state->query_buf, "%s", + quote_qualified_identifier( + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel))); if (attnamelist) { @@ -308,16 +348,19 @@ FreeRemoteCopyData(RemoteCopyData *state) /* Leave if nothing */ if (state == NULL) return; - +#ifdef XCP + if (state->locator) + freeLocator(state->locator); +#else if (state->connections) pfree(state->connections); +#endif if (state->query_buf.data) pfree(state->query_buf.data); FreeRelationLocInfo(state->rel_loc); pfree(state); } - #define APPENDSOFAR(query_buf, start, current) \ if (current > start) \ appendBinaryStringInfo(query_buf, start, current - start) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 432ae502be..ff531649cb 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -6,6 +6,11 @@ * * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -49,14 +54,63 @@ #include "catalog/pgxc_node.h" #include "catalog/namespace.h" #include "access/hash.h" +#ifdef XCP +#include "utils/date.h" +#include "utils/memutils.h" + +/* + * Locator details are private + */ +struct _Locator +{ + /* + * Determine target nodes for value. 
+ * Resulting nodes are stored to the results array. + * Function returns number of node references written to the array. + */ + int (*locatefunc) (Locator *self, Datum value, bool isnull, + bool *hasprimary); + Oid dataType; /* values of that type are passed to locateNodes function */ + LocatorListType listType; + bool primary; + /* locator-specific data */ + /* XXX: move them into union ? */ + int roundRobinNode; /* for LOCATOR_TYPE_RROBIN */ + LocatorHashFunc hashfunc; /* for LOCATOR_TYPE_HASH */ + int valuelen; /* 1, 2 or 4 for LOCATOR_TYPE_MODULO */ + + int nodeCount; /* How many nodes are in the map */ + void *nodeMap; /* map index to node reference according to listType */ + void *results; /* array to output results */ +}; +#endif -static Expr *pgxc_find_distcol_expr(Index varno, AttrNumber attrNum, +#ifndef XCP +static Expr *pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum, Node *quals); +#endif Oid primary_data_node = InvalidOid; int num_preferred_data_nodes = 0; Oid preferred_data_node[MAX_PREFERRED_NODES]; +#ifdef XCP +static int modulo_value_len(Oid dataType); +static LocatorHashFunc hash_func_ptr(Oid dataType); +static int locate_static(Locator *self, Datum value, bool isnull, + bool *hasprimary); +static int locate_roundrobin(Locator *self, Datum value, bool isnull, + bool *hasprimary); +static int locate_hash_insert(Locator *self, Datum value, bool isnull, + bool *hasprimary); +static int locate_hash_select(Locator *self, Datum value, bool isnull, + bool *hasprimary); +static int locate_modulo_insert(Locator *self, Datum value, bool isnull, + bool *hasprimary); +static int locate_modulo_select(Locator *self, Datum value, bool isnull, + bool *hasprimary); +#endif + static const unsigned int xc_mod_m[] = { 0x00000000, 0x55555555, 0x33333333, 0xc71c71c7, @@ -120,6 +174,59 @@ static const unsigned int xc_mod_r[][6] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff} }; + +#ifdef XCP +/* + * GetAnyDataNode + * Pick any 
data node from given set, but try a preferred node + */ +int +GetAnyDataNode(Bitmapset *nodes) +{ + Bitmapset *preferred = NULL; + int i, nodeid; + int nmembers = 0; + int members[NumDataNodes]; + + for (i = 0; i < num_preferred_data_nodes; i++) + { + char ntype = PGXC_NODE_DATANODE; + nodeid = PGXCNodeGetNodeId(preferred_data_node[i], &ntype); + + /* OK, found one */ + if (bms_is_member(nodeid, nodes)) + preferred = bms_add_member(preferred, nodeid); + } + + /* + * If no preferred data nodes or they are not in the desired set, pick up + * from the original set. + */ + if (bms_is_empty(preferred)) + preferred = bms_copy(nodes); + + /* + * Load balance. + * We can not get item from the set, convert it to array + */ + while ((nodeid = bms_first_member(preferred)) >= 0) + members[nmembers++] = nodeid; + bms_free(preferred); + + /* If there is a single member nothing to balance */ + if (nmembers == 1) + return members[0]; + + /* + * In general, the set may contain any number of nodes, and if we save + * previous returned index for load balancing the distribution won't be + * flat, because small set will probably reset saved value, and lower + * indexes will be picked up more often. + * So we just get a random value from 0..nmembers-1. + */ + return members[((unsigned int) random()) % nmembers]; +} +#else /* * GetPreferredReplicationNode * Pick any Datanode from given list, however fetch a preferred node first. 
@@ -127,31 +234,39 @@ static const unsigned int xc_mod_r[][6] = List * GetPreferredReplicationNode(List *relNodes) { - ListCell *item; - int nodeid = -1; - - if (list_length(relNodes) <= 0) - elog(ERROR, "a list of nodes should have at least one node"); - - foreach(item, relNodes) + /* + * Try to find the first node in given list relNodes + * that is in the list of preferred nodes + */ + if (num_preferred_data_nodes != 0) { - int cnt_nodes; - for (cnt_nodes = 0; - cnt_nodes < num_preferred_data_nodes && nodeid < 0; - cnt_nodes++) + ListCell *item; + foreach(item, relNodes) { - if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes], - PGXC_NODE_DATANODE) == lfirst_int(item)) - nodeid = lfirst_int(item); + int relation_nodeid = lfirst_int(item); + int i; + for (i = 0; i < num_preferred_data_nodes; i++) + { +#ifdef XCP + char nodetype = PGXC_NODE_DATANODE; + int nodeid = PGXCNodeGetNodeId(preferred_data_node[i], + &nodetype); +#else + int nodeid = PGXCNodeGetNodeId(preferred_data_node[i], PGXC_NODE_DATANODE); +#endif + + /* OK, found one */ + if (nodeid == relation_nodeid) + return lappend_int(NULL, nodeid); + } } - if (nodeid >= 0) - break; } - if (nodeid < 0) - return list_make1_int(linitial_int(relNodes)); - return list_make1_int(nodeid); + /* Nothing found? Return the first one in relation node list */ + return lappend_int(NULL, linitial_int(relNodes)); } +#endif + /* * compute_modulo @@ -206,6 +321,7 @@ compute_modulo(unsigned int numerator, unsigned int denominator) return numerator % denominator; } +#ifndef XCP /* * get_node_from_modulo - determine node based on modulo * @@ -219,57 +335,157 @@ get_node_from_modulo(int modulo, List *nodeList) return list_nth_int(nodeList, modulo); } +#endif /* - * GetRelationDistribColumn - * Return hash column name for relation or NULL if relation is not distributed. 
+ * GetRelationDistColumn - Returns the name of the hash or modulo distribution column + * First hash distribution is checked + * Retuens NULL if the table is neither hash nor modulo distributed */ char * -GetRelationDistribColumn(RelationLocInfo *locInfo) +GetRelationDistColumn(RelationLocInfo * rel_loc_info) { - /* No relation, so simply leave */ - if (!locInfo) - return NULL; +char *pColName; - /* No distribution column if relation is not distributed with a key */ - if (!IsRelationDistributedByValue(locInfo)) - return NULL; + pColName = NULL; - /* Return column name */ - return get_attname(locInfo->relid, locInfo->partAttrNum); + pColName = GetRelationHashColumn(rel_loc_info); + if (pColName == NULL) + pColName = GetRelationModuloColumn(rel_loc_info); + + return pColName; } +/* + * Returns whether or not the data type is hash distributable with PG-XC + * PGXCTODO - expand support for other data types! + */ +bool +IsTypeHashDistributable(Oid col_type) +{ +#ifdef XCP + return (hash_func_ptr(col_type) != NULL); +#else + if(col_type == INT8OID + || col_type == INT2OID + || col_type == OIDOID + || col_type == INT4OID + || col_type == BOOLOID + || col_type == CHAROID + || col_type == NAMEOID + || col_type == INT2VECTOROID + || col_type == TEXTOID + || col_type == OIDVECTOROID + || col_type == FLOAT4OID + || col_type == FLOAT8OID + || col_type == ABSTIMEOID + || col_type == RELTIMEOID + || col_type == CASHOID + || col_type == BPCHAROID + || col_type == BYTEAOID + || col_type == VARCHAROID + || col_type == DATEOID + || col_type == TIMEOID + || col_type == TIMESTAMPOID + || col_type == TIMESTAMPTZOID + || col_type == INTERVALOID + || col_type == TIMETZOID + || col_type == NUMERICOID + ) + return true; + + return false; +#endif +} /* - * IsDistribColumn - * Return whether column for relation is used for distribution or not. + * GetRelationHashColumn - return hash column for relation. + * + * Returns NULL if the relation is not hash partitioned. 
+ */ +char * +GetRelationHashColumn(RelationLocInfo * rel_loc_info) +{ + char *column_str = NULL; + + if (rel_loc_info == NULL) + column_str = NULL; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + column_str = NULL; + else + { + int len = strlen(rel_loc_info->partAttrName); + + column_str = (char *) palloc(len + 1); + strncpy(column_str, rel_loc_info->partAttrName, len + 1); + } + + return column_str; +} + +/* + * IsHashColumn - return whether or not column for relation is hashed. + * */ bool -IsDistribColumn(Oid relid, AttrNumber attNum) +IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name) { - RelationLocInfo *locInfo = GetRelationLocInfo(relid); + bool ret_value = false; - /* No locator info, so leave */ - if (!locInfo) - return false; + if (!rel_loc_info || !part_col_name) + ret_value = false; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + ret_value = false; + else + ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName); - /* No distribution column if relation is not distributed with a key */ - if (!IsRelationDistributedByValue(locInfo)) - return false; + return ret_value; +} - /* Finally check if attribute is distributed */ - return locInfo->partAttrNum == attNum; + +/* + * IsHashColumnForRelId - return whether or not column for relation is hashed. 
+ * + */ +bool +IsHashColumnForRelId(Oid relid, char *part_col_name) +{ + RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid); + + return IsHashColumn(rel_loc_info, part_col_name); +} + +/* + * IsDistColumnForRelId - return whether or not column for relation is used for hash or modulo distribution + * + */ +bool +IsDistColumnForRelId(Oid relid, char *part_col_name) +{ + bool bRet; + RelationLocInfo *rel_loc_info; + + rel_loc_info = GetRelationLocInfo(relid); + bRet = false; + + bRet = IsHashColumn(rel_loc_info, part_col_name); + if (bRet == false) + IsModuloColumn(rel_loc_info, part_col_name); + return bRet; } /* - * IsTypeDistributable - * Returns whether the data type is distributable using a column value. + * Returns whether or not the data type is modulo distributable with PG-XC + * PGXCTODO - expand support for other data types! */ bool -IsTypeDistributable(Oid col_type) +IsTypeModuloDistributable(Oid col_type) { +#ifdef XCP + return (modulo_value_len(col_type) != -1); +#else if(col_type == INT8OID || col_type == INT2OID || col_type == OIDOID @@ -299,12 +515,68 @@ IsTypeDistributable(Oid col_type) return true; return false; +#endif +} + +/* + * GetRelationModuloColumn - return modulo column for relation. + * + * Returns NULL if the relation is not modulo partitioned. + */ +char * +GetRelationModuloColumn(RelationLocInfo * rel_loc_info) +{ + char *column_str = NULL; + + if (rel_loc_info == NULL) + column_str = NULL; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO) + column_str = NULL; + else + { + int len = strlen(rel_loc_info->partAttrName); + + column_str = (char *) palloc(len + 1); + strncpy(column_str, rel_loc_info->partAttrName, len + 1); + } + + return column_str; +} + +/* + * IsModuloColumn - return whether or not column for relation is used for modulo distribution. 
+ * + */ +bool +IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name) +{ + bool ret_value = false; + + if (!rel_loc_info || !part_col_name) + ret_value = false; + else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO) + ret_value = false; + else + ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName); + + return ret_value; } /* - * GetRoundRobinNode - * Update the round robin node for the relation. + * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution. + */ +bool +IsModuloColumnForRelId(Oid relid, char *part_col_name) +{ + RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid); + + return IsModuloColumn(rel_loc_info, part_col_name); +} + +/* + * Update the round robin node for the relation + * * PGXCTODO - may not want to bother with locking here, we could track * these in the session memory context instead... */ @@ -314,8 +586,13 @@ GetRoundRobinNode(Oid relid) int ret_node; Relation rel = relation_open(relid, AccessShareLock); - Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED || +#ifdef XCP + Assert (IsLocatorReplicated(rel->rd_locator_info->locatorType) || rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN); +#else + Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED || + rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN); +#endif ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode); @@ -333,6 +610,7 @@ GetRoundRobinNode(Oid relid) /* * IsTableDistOnPrimary + * * Does the table distribution list include the primary node? 
*/ bool @@ -342,13 +620,19 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info) if (!OidIsValid(primary_data_node) || rel_loc_info == NULL || - list_length(rel_loc_info->nodeList) == 0) + list_length(rel_loc_info->nodeList = 0)) return false; foreach(item, rel_loc_info->nodeList) { +#ifdef XCP + char ntype = PGXC_NODE_DATANODE; + if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item)) + return true; +#else if (PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) == lfirst_int(item)) return true; +#endif } return false; } @@ -359,25 +643,24 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info) * Check equality of given locator information */ bool -IsLocatorInfoEqual(RelationLocInfo *locInfo1, - RelationLocInfo *locInfo2) +IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2) { List *nodeList1, *nodeList2; - Assert(locInfo1 && locInfo2); + Assert(rel_loc_info1 && rel_loc_info2); - nodeList1 = locInfo1->nodeList; - nodeList2 = locInfo2->nodeList; + nodeList1 = rel_loc_info1->nodeList; + nodeList2 = rel_loc_info2->nodeList; /* Same relation? */ - if (locInfo1->relid != locInfo2->relid) + if (rel_loc_info1->relid != rel_loc_info2->relid) return false; /* Same locator type? */ - if (locInfo1->locatorType != locInfo2->locatorType) + if (rel_loc_info1->locatorType != rel_loc_info2->locatorType) return false; /* Same attribute number? */ - if (locInfo1->partAttrNum != locInfo2->partAttrNum) + if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum) return false; /* Same node list? 
*/ @@ -390,6 +673,7 @@ IsLocatorInfoEqual(RelationLocInfo *locInfo1, } +#ifndef XCP /* * GetRelationNodes * @@ -417,30 +701,22 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, long hashValue; int modulo; int nodeIndex; + int k; if (rel_loc_info == NULL) return NULL; exec_nodes = makeNode(ExecNodes); exec_nodes->baselocatortype = rel_loc_info->locatorType; - exec_nodes->accesstype = accessType; switch (rel_loc_info->locatorType) { case LOCATOR_TYPE_REPLICATED: - /* - * When intention is to read from replicated table, return all the - * nodes so that planner can choose one depending upon the rest of - * the JOIN tree. But while reading with update lock, we need to - * read from the primary node (if exists) so as to avoid the - * deadlock. - * For write access set primary node (if exists). - */ - exec_nodes->nodeList = list_copy(rel_loc_info->nodeList); if (accessType == RELATION_ACCESS_UPDATE || accessType == RELATION_ACCESS_INSERT) { /* we need to write to all synchronously */ + exec_nodes->nodeList = list_concat(exec_nodes->nodeList, rel_loc_info->nodeList); /* * Write to primary node first, to reduce chance of a deadlock @@ -450,22 +726,57 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, && exec_nodes->nodeList && list_length(exec_nodes->nodeList) > 1) /* make sure more than 1 */ { - exec_nodes->primarynodelist = list_make1_int(PGXCNodeGetNodeId(primary_data_node, - PGXC_NODE_DATANODE)); - exec_nodes->nodeList = list_delete_int(exec_nodes->nodeList, - PGXCNodeGetNodeId(primary_data_node, - PGXC_NODE_DATANODE)); + exec_nodes->primarynodelist = lappend_int(NULL, + PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE)); + list_delete_int(exec_nodes->nodeList, + PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE)); } } - else if (accessType == RELATION_ACCESS_READ_FOR_UPDATE && - IsTableDistOnPrimary(rel_loc_info)) + else { /* - * We should ensure row is locked on the primary node to - * avoid distributed 
deadlock if updating the same row - * concurrently + * In case there are nodes defined in location info, initialize node list + * with a default node being the first node in list. + * This node list may be changed if a better one is found afterwards. */ - exec_nodes->nodeList = list_make1_int(PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE)); + if (rel_loc_info->nodeList) + exec_nodes->nodeList = lappend_int(NULL, + linitial_int(rel_loc_info->nodeList)); + + if (accessType == RELATION_ACCESS_READ_FOR_UPDATE && + IsTableDistOnPrimary(rel_loc_info)) + { + /* + * We should ensure row is locked on the primary node to + * avoid distributed deadlock if updating the same row + * concurrently + */ + exec_nodes->nodeList = lappend_int(NULL, + PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE)); + } + else if (num_preferred_data_nodes > 0) + { + ListCell *item; + + foreach(item, rel_loc_info->nodeList) + { + for (k = 0; k < num_preferred_data_nodes; k++) + { + if (PGXCNodeGetNodeId(preferred_data_node[k], + PGXC_NODE_DATANODE) == lfirst_int(item)) + { + exec_nodes->nodeList = lappend_int(NULL, + lfirst_int(item)); + break; + } + } + } + } + + /* If nothing found just read from one of them. 
Use round robin mechanism */ + if (exec_nodes->nodeList == NULL) + exec_nodes->nodeList = lappend_int(NULL, + GetRoundRobinNode(rel_loc_info->relid)); } break; @@ -477,27 +788,37 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, rel_loc_info->locatorType); modulo = compute_modulo(abs(hashValue), list_length(rel_loc_info->nodeList)); nodeIndex = get_node_from_modulo(modulo, rel_loc_info->nodeList); - exec_nodes->nodeList = list_make1_int(nodeIndex); + exec_nodes->nodeList = lappend_int(NULL, nodeIndex); } else { if (accessType == RELATION_ACCESS_INSERT) /* Insert NULL to first node*/ - exec_nodes->nodeList = list_make1_int(linitial_int(rel_loc_info->nodeList)); + exec_nodes->nodeList = lappend_int(NULL, linitial_int(rel_loc_info->nodeList)); else - exec_nodes->nodeList = list_copy(rel_loc_info->nodeList); + exec_nodes->nodeList = list_concat(exec_nodes->nodeList, rel_loc_info->nodeList); } break; + case LOCATOR_TYPE_SINGLE: + /* just return first (there should only be one) */ + exec_nodes->nodeList = list_concat(exec_nodes->nodeList, + rel_loc_info->nodeList); + break; + case LOCATOR_TYPE_RROBIN: - /* - * round robin, get next one in case of insert. 
If not insert, all - * node needed - */ + /* round robin, get next one */ if (accessType == RELATION_ACCESS_INSERT) - exec_nodes->nodeList = list_make1_int(GetRoundRobinNode(rel_loc_info->relid)); + { + /* write to just one of them */ + exec_nodes->nodeList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); + } else - exec_nodes->nodeList = list_copy(rel_loc_info->nodeList); + { + /* we need to read from all */ + exec_nodes->nodeList = list_concat(exec_nodes->nodeList, + rel_loc_info->nodeList); + } break; /* PGXCTODO case LOCATOR_TYPE_RANGE: */ @@ -534,7 +855,7 @@ GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals, * If the table distributed by value, check if we can reduce the Datanodes * by looking at the qualifiers for this relation */ - if (IsRelationDistributedByValue(rel_loc_info)) + if (IsLocatorDistributedByValue(rel_loc_info->locatorType)) { Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum); int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum); @@ -584,26 +905,62 @@ GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals, relaccess); return exec_nodes; } +#endif /* - * GetLocatorType - * Returns the locator type of the table. + * ConvertToLocatorType + * get locator distribution type + * We really should just have pgxc_class use disttype instead... 
+ */ +char +ConvertToLocatorType(int disttype) +{ + char loctype = LOCATOR_TYPE_NONE; + + switch (disttype) + { + case DISTTYPE_HASH: + loctype = LOCATOR_TYPE_HASH; + break; + case DISTTYPE_ROUNDROBIN: + loctype = LOCATOR_TYPE_RROBIN; + break; + case DISTTYPE_REPLICATION: + loctype = LOCATOR_TYPE_REPLICATED; + break; + case DISTTYPE_MODULO: + loctype = LOCATOR_TYPE_MODULO; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("Invalid distribution type"))); + break; + } + + return loctype; +} + + +/* + * GetLocatorType - Returns the locator type of the table + * */ char GetLocatorType(Oid relid) { - char ret = LOCATOR_TYPE_NONE; - RelationLocInfo *locInfo = GetRelationLocInfo(relid); + char ret = '\0'; - if (locInfo != NULL) - ret = locInfo->locatorType; + RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid); + + if (ret_loc_info != NULL) + ret = ret_loc_info->locatorType; return ret; } /* - * GetAllDataNodes * Return a list of all Datanodes. * We assume all tables use all nodes in the prototype, so just return a list * from first one. @@ -621,7 +978,6 @@ GetAllDataNodes(void) } /* - * GetAllCoordNodes * Return a list of all Coordinators * This is used to send DDL to all nodes and to clean up pooler connections. * Do not put in the list the local Coordinator where this function is launched. @@ -648,7 +1004,6 @@ GetAllCoordNodes(void) /* - * RelationBuildLocator * Build locator information associated with the specified relation. 
*/ void @@ -693,12 +1048,24 @@ RelationBuildLocator(Relation rel) relationLocInfo->locatorType = pgxc_class->pclocatortype; relationLocInfo->partAttrNum = pgxc_class->pcattnum; + + relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, pgxc_class->pcattnum); + relationLocInfo->nodeList = NIL; +#ifdef XCP + for (j = 0; j < pgxc_class->nodeoids.dim1; j++) + { + char ntype = PGXC_NODE_DATANODE; + int nid = PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], &ntype); + relationLocInfo->nodeList = lappend_int(relationLocInfo->nodeList, nid); + } +#else for (j = 0; j < pgxc_class->nodeoids.dim1; j++) relationLocInfo->nodeList = lappend_int(relationLocInfo->nodeList, PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], PGXC_NODE_DATANODE)); +#endif /* * If the locator type is round robin, we set a node to @@ -706,7 +1073,11 @@ RelationBuildLocator(Relation rel) * we choose a node to use for balancing reads. */ if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN +#ifdef XCP + || IsLocatorReplicated(relationLocInfo->locatorType)) +#else || relationLocInfo->locatorType == LOCATOR_TYPE_REPLICATED) +#endif { int offset; /* @@ -728,8 +1099,7 @@ RelationBuildLocator(Relation rel) } /* - * GetLocatorRelationInfo - * Returns the locator information for relation, + * GetLocatorRelationInfo - Returns the locator information for relation, * in a copy of the RelationLocatorInfo struct in relcache */ RelationLocInfo * @@ -750,43 +1120,61 @@ GetRelationLocInfo(Oid relid) } /* - * CopyRelationLocInfo + * Get the distribution type of relation. 
+ */ +char +GetRelationLocType(Oid relid) +{ + RelationLocInfo *locinfo = GetRelationLocInfo(relid); + if (!locinfo) + return LOCATOR_TYPE_NONE; + + return locinfo->locatorType; +} + +/* * Copy the RelationLocInfo struct */ RelationLocInfo * -CopyRelationLocInfo(RelationLocInfo *srcInfo) +CopyRelationLocInfo(RelationLocInfo * src_info) { - RelationLocInfo *destInfo; + RelationLocInfo *dest_info; + + Assert(src_info); - Assert(srcInfo); - destInfo = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo)); + dest_info = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo)); - destInfo->relid = srcInfo->relid; - destInfo->locatorType = srcInfo->locatorType; - destInfo->partAttrNum = srcInfo->partAttrNum; - if (srcInfo->nodeList) - destInfo->nodeList = list_copy(srcInfo->nodeList); + dest_info->relid = src_info->relid; + dest_info->locatorType = src_info->locatorType; + dest_info->partAttrNum = src_info->partAttrNum; + if (src_info->partAttrName) + dest_info->partAttrName = pstrdup(src_info->partAttrName); - /* Note: for roundrobin, we use the relcache entry */ - return destInfo; + if (src_info->nodeList) + dest_info->nodeList = list_copy(src_info->nodeList); + /* Note, for round robin, we use the relcache entry */ + + return dest_info; } /* - * FreeRelationLocInfo * Free RelationLocInfo struct */ void FreeRelationLocInfo(RelationLocInfo *relationLocInfo) { if (relationLocInfo) + { + if (relationLocInfo->partAttrName) + pfree(relationLocInfo->partAttrName); pfree(relationLocInfo); + } } + /* - * FreeExecNodes - * Free the contents of the ExecNodes expression - */ + * Free the contents of the ExecNodes expression */ void FreeExecNodes(ExecNodes **exec_nodes) { @@ -801,6 +1189,699 @@ FreeExecNodes(ExecNodes **exec_nodes) *exec_nodes = NULL; } + +#ifdef XCP +/* + * Determine value length in bytes for specified type for a module locator. + * Return -1 if module locator is not supported for the type. 
+ */ +static int +modulo_value_len(Oid dataType) +{ + switch (dataType) + { + case BOOLOID: + case CHAROID: + return 1; + case INT2OID: + return 2; + case INT4OID: + case ABSTIMEOID: + case RELTIMEOID: + case DATEOID: + return 4; + default: + return -1; + } +} + + +static LocatorHashFunc +hash_func_ptr(Oid dataType) +{ + switch (dataType) + { + case INT8OID: + case CASHOID: + return hashint8; + case INT2OID: + return hashint2; + case OIDOID: + return hashoid; + case INT4OID: + case ABSTIMEOID: + case RELTIMEOID: + case DATEOID: + return hashint4; + case BOOLOID: + case CHAROID: + return hashchar; + case NAMEOID: + return hashname; + case INT2VECTOROID: + return hashint2vector; + case VARCHAROID: + case TEXTOID: + return hashtext; + case OIDVECTOROID: + return hashoidvector; + case BPCHAROID: + return hashbpchar; + case BYTEAOID: + return hashvarlena; + case TIMEOID: + return time_hash; + case TIMESTAMPOID: + case TIMESTAMPTZOID: + return timestamp_hash; + case INTERVALOID: + return interval_hash; + case TIMETZOID: + return timetz_hash; + case NUMERICOID: + return hash_numeric; + case UUIDOID: + return uuid_hash; + default: + return NULL; + } +} + + +Locator * +createLocator(char locatorType, RelationAccessType accessType, + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary) +{ + Locator *locator; + ListCell *lc; + void *nodeMap; + int i; + + locator = (Locator *) palloc(sizeof(Locator)); + locator->dataType = dataType; + locator->listType = listType; + locator->nodeCount = nodeCount; + /* Create node map */ + switch (listType) + { + case LOCATOR_LIST_NONE: + /* No map, return indexes */ + nodeMap = NULL; + break; + case LOCATOR_LIST_INT: + /* Copy integer array */ + nodeMap = palloc(nodeCount * sizeof(int)); + memcpy(nodeMap, nodeList, nodeCount * sizeof(int)); + break; + case LOCATOR_LIST_OID: + /* Copy array of Oids */ + nodeMap = palloc(nodeCount * sizeof(Oid)); + memcpy(nodeMap, nodeList, nodeCount * 
sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + /* Copy array of Oids */ + nodeMap = palloc(nodeCount * sizeof(void *)); + memcpy(nodeMap, nodeList, nodeCount * sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Create map from list */ + { + List *l = (List *) nodeList; + locator->nodeCount = list_length(l); + if (IsA(l, IntList)) + { + int *intptr; + nodeMap = palloc(locator->nodeCount * sizeof(int)); + intptr = (int *) nodeMap; + foreach(lc, l) + *intptr++ = lfirst_int(lc); + locator->listType = LOCATOR_LIST_INT; + } + else if (IsA(l, OidList)) + { + Oid *oidptr; + nodeMap = palloc(locator->nodeCount * sizeof(Oid)); + oidptr = (Oid *) nodeMap; + foreach(lc, l) + *oidptr++ = lfirst_oid(lc); + locator->listType = LOCATOR_LIST_OID; + } + else if (IsA(l, List)) + { + void **voidptr; + nodeMap = palloc(locator->nodeCount * sizeof(void *)); + voidptr = (void **) nodeMap; + foreach(lc, l) + *voidptr++ = lfirst(lc); + locator->listType = LOCATOR_LIST_POINTER; + } + else + { + /* can not get here */ + Assert(false); + } + break; + } + } + /* + * Determine locatefunc, allocate results, set up parameters + * specific to locator type + */ + switch (locatorType) + { + case LOCATOR_TYPE_REPLICATED: + if (accessType == RELATION_ACCESS_INSERT || + accessType == RELATION_ACCESS_UPDATE) + { + locator->locatefunc = locate_static; + if (nodeMap == NULL) + { + /* no map, prepare array with indexes */ + int *intptr; + nodeMap = palloc(locator->nodeCount * sizeof(int)); + intptr = (int *) nodeMap; + for (i = 0; i < locator->nodeCount; i++) + *intptr++ = i; + } + locator->nodeMap = nodeMap; + locator->results = nodeMap; + } + else + { + locator->locatefunc = locate_roundrobin; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = 
palloc(sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + locator->roundRobinNode = -1; + } + break; + case LOCATOR_TYPE_RROBIN: + if (accessType == RELATION_ACCESS_INSERT) + { + locator->locatefunc = locate_roundrobin; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = palloc(sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + locator->roundRobinNode = -1; + } + else + { + locator->locatefunc = locate_static; + if (nodeMap == NULL) + { + /* no map, prepare array with indexes */ + int *intptr; + nodeMap = palloc(locator->nodeCount * sizeof(int)); + intptr = (int *) nodeMap; + for (i = 0; i < locator->nodeCount; i++) + *intptr++ = i; + } + locator->nodeMap = nodeMap; + locator->results = nodeMap; + } + break; + case LOCATOR_TYPE_HASH: + if (accessType == RELATION_ACCESS_INSERT) + { + locator->locatefunc = locate_hash_insert; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = palloc(sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + } + else + { + locator->locatefunc = locate_hash_select; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(locator->nodeCount * sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(locator->nodeCount * sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = 
palloc(locator->nodeCount * sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + } + + locator->hashfunc = hash_func_ptr(dataType); + if (locator->hashfunc == NULL) + ereport(ERROR, (errmsg("Error: unsupported data type for HASH locator: %d\n", + dataType))); + break; + case LOCATOR_TYPE_MODULO: + if (accessType == RELATION_ACCESS_INSERT) + { + locator->locatefunc = locate_modulo_insert; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = palloc(sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + } + else + { + locator->locatefunc = locate_modulo_select; + locator->nodeMap = nodeMap; + switch (locator->listType) + { + case LOCATOR_LIST_NONE: + case LOCATOR_LIST_INT: + locator->results = palloc(locator->nodeCount * sizeof(int)); + break; + case LOCATOR_LIST_OID: + locator->results = palloc(locator->nodeCount * sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + locator->results = palloc(locator->nodeCount * sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + } + + locator->valuelen = modulo_value_len(dataType); + if (locator->valuelen == -1) + ereport(ERROR, (errmsg("Error: unsupported data type for MODULO locator: %d\n", + dataType))); + break; + default: + ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n", + locatorType))); + } + + if (result) + *result = locator->results; + + return locator; +} + + +void +freeLocator(Locator *locator) +{ + pfree(locator->nodeMap); + /* + * locator->nodeMap and locator->results may point to the same memory, + * do not free it twice + */ + if (locator->results != locator->nodeMap) + 
pfree(locator->results); + pfree(locator); +} + + +/* + * Each time return the same predefined results + */ +static int +locate_static(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + /* TODO */ + if (hasprimary) + *hasprimary = false; + return self->nodeCount; +} + + +/* + * Each time return one next node, in round robin manner + */ +static int +locate_roundrobin(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + /* TODO */ + if (hasprimary) + *hasprimary = false; + if (++self->roundRobinNode >= self->nodeCount) + self->roundRobinNode = 0; + switch (self->listType) + { + case LOCATOR_LIST_NONE: + ((int *) self->results)[0] = self->roundRobinNode; + break; + case LOCATOR_LIST_INT: + ((int *) self->results)[0] = + ((int *) self->nodeMap)[self->roundRobinNode]; + break; + case LOCATOR_LIST_OID: + ((Oid *) self->results)[0] = + ((Oid *) self->nodeMap)[self->roundRobinNode]; + break; + case LOCATOR_LIST_POINTER: + ((void **) self->results)[0] = + ((void **) self->nodeMap)[self->roundRobinNode]; + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return 1; +} + + +/* + * Calculate hash from supplied value and use modulo by nodeCount as an index + */ +static int +locate_hash_insert(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + int index; + if (hasprimary) + *hasprimary = false; + if (isnull) + index = 0; + else + { + unsigned int hash32; + + hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value)); + + index = compute_modulo(hash32, self->nodeCount); + } + switch (self->listType) + { + case LOCATOR_LIST_NONE: + ((int *) self->results)[0] = index; + break; + case LOCATOR_LIST_INT: + ((int *) self->results)[0] = ((int *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_OID: + ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_POINTER: + ((void **) self->results)[0] = ((void **) self->nodeMap)[index]; + break; + 
case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return 1; +} + + +/* + * Calculate hash from supplied value and use modulo by nodeCount as an index + * if value is NULL assume no hint and return all the nodes. + */ +static int +locate_hash_select(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + if (hasprimary) + *hasprimary = false; + if (isnull) + { + int i; + switch (self->listType) + { + case LOCATOR_LIST_NONE: + for (i = 0; i < self->nodeCount; i++) + ((int *) self->results)[i] = i; + break; + case LOCATOR_LIST_INT: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(int)); + break; + case LOCATOR_LIST_OID: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return self->nodeCount; + } + else + { + unsigned int hash32; + int index; + + hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value)); + + index = compute_modulo(hash32, self->nodeCount); + switch (self->listType) + { + case LOCATOR_LIST_NONE: + ((int *) self->results)[0] = index; + break; + case LOCATOR_LIST_INT: + ((int *) self->results)[0] = ((int *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_OID: + ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_POINTER: + ((void **) self->results)[0] = ((void **) self->nodeMap)[index]; + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return 1; + } +} + + +/* + * Use modulo of supplied value by nodeCount as an index + */ +static int +locate_modulo_insert(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + int index; + if (hasprimary) + *hasprimary = false; + if (isnull) + index = 0; + else + { + unsigned int mod32; + + if (self->valuelen == 4) 
+ mod32 = (unsigned int) (GET_4_BYTES(value)); + else if (self->valuelen == 2) + mod32 = (unsigned int) (GET_2_BYTES(value)); + else if (self->valuelen == 1) + mod32 = (unsigned int) (GET_1_BYTE(value)); + else + mod32 = 0; + + index = compute_modulo(mod32, self->nodeCount); + } + switch (self->listType) + { + case LOCATOR_LIST_NONE: + ((int *) self->results)[0] = index; + break; + case LOCATOR_LIST_INT: + ((int *) self->results)[0] = ((int *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_OID: + ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_POINTER: + ((void **) self->results)[0] = ((void **) self->nodeMap)[index]; + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return 1; +} + + +/* + * Use modulo of supplied value by nodeCount as an index + * if value is NULL assume no hint and return all the nodes. + */ +static int +locate_modulo_select(Locator *self, Datum value, bool isnull, + bool *hasprimary) +{ + if (hasprimary) + *hasprimary = false; + if (isnull) + { + int i; + switch (self->listType) + { + case LOCATOR_LIST_NONE: + for (i = 0; i < self->nodeCount; i++) + ((int *) self->results)[i] = i; + break; + case LOCATOR_LIST_INT: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(int)); + break; + case LOCATOR_LIST_OID: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(Oid)); + break; + case LOCATOR_LIST_POINTER: + memcpy(self->results, self->nodeMap, + self->nodeCount * sizeof(void *)); + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return self->nodeCount; + } + else + { + unsigned int mod32; + int index; + + if (self->valuelen == 4) + mod32 = (unsigned int) (GET_4_BYTES(value)); + else if (self->valuelen == 2) + mod32 = (unsigned int) (GET_2_BYTES(value)); + else if (self->valuelen == 1) + mod32 = (unsigned int) (GET_1_BYTE(value)); + else + mod32 = 0; + + index = compute_modulo(mod32, 
self->nodeCount); + + switch (self->listType) + { + case LOCATOR_LIST_NONE: + ((int *) self->results)[0] = index; + break; + case LOCATOR_LIST_INT: + ((int *) self->results)[0] = ((int *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_OID: + ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index]; + break; + case LOCATOR_LIST_POINTER: + ((void **) self->results)[0] = ((void **) self->nodeMap)[index]; + break; + case LOCATOR_LIST_LIST: + /* Should never happen */ + Assert(false); + break; + } + return 1; + } +} + + +int +GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary) +{ + return (*self->locatefunc) (self, value, isnull, hasprimary); +} + + +void * +getLocatorResults(Locator *self) +{ + return self->results; +} + + +void * +getLocatorNodeMap(Locator *self) +{ + return self->nodeMap; +} + + +int +getLocatorNodeCount(Locator *self) +{ + return self->nodeCount; +} +#endif + + +#ifndef XCP /* * pgxc_find_distcol_expr * Search through the quals provided and find out an expression which will give @@ -814,23 +1895,12 @@ FreeExecNodes(ExecNodes **exec_nodes) * this function returns NULL. */ static Expr * -pgxc_find_distcol_expr(Index varno, - AttrNumber attrNum, - Node *quals) +pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum, + Node *quals) { - List *lquals; + /* Convert the qualification into list of arguments of AND */ + List *lquals = make_ands_implicit((Expr *)quals); ListCell *qual_cell; - - /* If no quals, no distribution column expression */ - if (!quals) - return NULL; - - /* Convert the qualification into List if it's not already so */ - if (!IsA(quals, List)) - lquals = make_ands_implicit((Expr *)quals); - else - lquals = (List *)quals; - /* * For every ANDed expression, check if that expression is of the form * <distribution_col> = <expr>. If so return expr. 
@@ -888,7 +1958,7 @@ pgxc_find_distcol_expr(Index varno, * If Var found is not the distribution column of required relation, * check next qual */ - if (var_expr->varno != varno || var_expr->varattno != attrNum) + if (var_expr->varno != varno || var_expr->varattno != partAttrNum) continue; /* * If the operator is not an assignment operator, check next @@ -907,3 +1977,4 @@ pgxc_find_distcol_expr(Index varno, /* Exhausted all quals, but no distribution column expression */ return NULL; } +#endif diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c index c99bfe822e..98a6f6e355 100644 --- a/src/backend/pgxc/locator/redistrib.c +++ b/src/backend/pgxc/locator/redistrib.c @@ -159,8 +159,8 @@ pgxc_redist_build_replicate_to_distrib(RedistribState *distribState, return; /* Redistribution is done from replication to distributed (with value) */ - if (!IsRelationReplicated(oldLocInfo) || - !IsRelationDistributedByValue(newLocInfo)) + if (!IsLocatorReplicated(oldLocInfo->locatorType) || + !IsLocatorDistributedByValue(newLocInfo->locatorType)) return; /* Get the list of nodes that are added to the relation */ @@ -243,8 +243,8 @@ pgxc_redist_build_replicate(RedistribState *distribState, return; /* Case of a replicated table whose set of nodes is changed */ - if (!IsRelationReplicated(newLocInfo) || - !IsRelationReplicated(oldLocInfo)) + if (!IsLocatorReplicated(newLocInfo->locatorType) || + !IsLocatorReplicated(oldLocInfo->locatorType)) return; /* Get the list of nodes that are added to the relation */ @@ -410,6 +410,18 @@ distrib_copy_to(RedistribState *distribState) get_namespace_name(RelationGetNamespace(rel)), RelationGetRelationName(rel)))); +#ifdef XCP + /* Begin the COPY process */ + DataNodeCopyBegin(copyState); + + /* Create tuplestore storage */ + store = tuplestore_begin_message(false, work_mem); + + /* Then get rows and copy them to the tuplestore used for redistribution */ + DataNodeCopyStore( + (PGXCNodeHandle **) 
getLocatorNodeMap(copyState->locator), + getLocatorNodeCount(copyState->locator), store); +#else /* Begin the COPY process */ copyState->connections = DataNodeCopyBegin(copyState->query_buf.data, copyState->exec_nodes->nodeList, @@ -425,6 +437,7 @@ distrib_copy_to(RedistribState *distribState) NULL, store, /* Tuplestore used for redistribution */ REMOTE_COPY_TUPLESTORE); +#endif /* Do necessary clean-up */ FreeRemoteCopyOptions(options); @@ -450,8 +463,17 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) Relation rel; RemoteCopyOptions *options; RemoteCopyData *copyState; +#ifndef XCP bool replicated, contains_tuple = true; +#endif TupleDesc tupdesc; +#ifdef XCP + /* May be needed to decode partitioning value */ + int partIdx; + FmgrInfo in_function; + Oid typioparam; + int typmod; +#endif /* Nothing to do if on remote node */ if (IS_PGXC_DATANODE || IsConnFromCoord()) @@ -472,6 +494,14 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) RemoteCopy_GetRelationLoc(copyState, rel, NIL); RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL); +#ifdef XCP + /* Modify relation location as requested */ + if (exec_nodes) + { + if (exec_nodes->nodeList) + copyState->rel_loc->nodeList = exec_nodes->nodeList; + } +#else /* * When building COPY FROM command in redistribution list, * use the list of nodes that has been calculated there. 
@@ -482,8 +512,37 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) copyState->exec_nodes->nodeList = exec_nodes->nodeList; copyState->rel_loc->nodeList = exec_nodes->nodeList; } +#endif tupdesc = RelationGetDescr(rel); +#ifdef XCP + if (AttributeNumberIsValid(copyState->rel_loc->partAttrNum)) + { + Oid in_func_oid; + int dropped = 0; + int i; + + partIdx = copyState->rel_loc->partAttrNum - 1; + + /* prepare function to decode partitioning value */ + getTypeInputInfo(copyState->dist_type, + &in_func_oid, &typioparam); + fmgr_info(in_func_oid, &in_function); + typmod = tupdesc->attrs[partIdx]->atttypmod; + + /* + * Make partIdx pointing to correct field of the datarow. + * The data row does not contain data of dropped attributes, we should + * decrement partIdx appropriately + */ + for (i = 0; i < partIdx; i++) + { + if (tupdesc->attrs[i]->attisdropped) + dropped++; + } + partIdx -= dropped; + } +#endif /* Inform client of operation being done */ ereport(DEBUG1, @@ -491,6 +550,55 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) get_namespace_name(RelationGetNamespace(rel)), RelationGetRelationName(rel)))); +#ifdef XCP + DataNodeCopyBegin(copyState); + + /* Send each COPY message stored to remote nodes */ + while (true) + { + char *data; + int len; + Datum value = (Datum) 0; + bool is_null = true; + + /* Get message from the tuplestore */ + data = tuplestore_getmessage(store, &len); + if (!data) + break; + + /* Find value of distribution column if necessary */ + if (AttributeNumberIsValid(copyState->rel_loc->partAttrNum)) + { + char **fields; + + /* + * Split message on an array of fields. + * Last \n is not included in converted message. 
+ */ + fields = CopyOps_RawDataToArrayField(tupdesc, data, len - 1); + + /* Determine partitioning value */ + if (fields[partIdx]) + { + value = InputFunctionCall(&in_function, fields[partIdx], + typioparam, typmod); + is_null = false; + } + } + + if (DataNodeCopyIn(data, len, + GET_NODES(copyState->locator, value, is_null, NULL), + (PGXCNodeHandle**) getLocatorResults(copyState->locator))) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Copy failed on a data node"))); + + /* Clean up */ + pfree(data); + } + DataNodeCopyFinish(getLocatorNodeCount(copyState->locator), + (PGXCNodeHandle **) getLocatorNodeMap(copyState->locator)); +#else /* Begin redistribution on remote nodes */ copyState->connections = DataNodeCopyBegin(copyState->query_buf.data, copyState->exec_nodes->nodeList, @@ -561,6 +669,7 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) DataNodeCopyFinish(copyState->connections, replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1, replicated ? 
COMBINE_TYPE_SAME : COMBINE_TYPE_SUM); +#endif /* Lock is maintained until transaction commits */ relation_close(rel, NoLock); @@ -721,8 +830,10 @@ distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes) hashfuncname = get_compute_hash_function(hashtype, locinfo->locatorType); /* Get distribution column name */ - if (IsRelationDistributedByValue(locinfo)) - colname = GetRelationDistribColumn(locinfo); + if (locinfo->locatorType == LOCATOR_TYPE_HASH) + colname = GetRelationHashColumn(locinfo); + else if (locinfo->locatorType == LOCATOR_TYPE_MODULO) + colname = GetRelationModuloColumn(locinfo); else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -859,7 +970,9 @@ distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes) /* Redistribution operations only concern Datanodes */ step->exec_type = EXEC_ON_DATANODES; +#ifndef XCP step->is_temp = is_temp; +#endif ExecRemoteUtility(step); pfree(step->sql_statement); pfree(step); diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index ee5ef63efb..dc77212dfd 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -204,6 +204,15 @@ check_node_options(const char *node_name, List *options, char **node_host, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("PGXC node %s: Node type not specified", node_name))); + +#ifdef XCP + if (node_type == PGXC_NODE_DATANODE && NumDataNodes >= MaxDataNodes) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("Too many datanodes, current value of max_data_nodes is %d", + MaxDataNodes))); + +#endif } /* @@ -347,6 +356,9 @@ PgxcNodeListAndCount(void) heap_endscan(scan); heap_close(rel, AccessShareLock); + elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes", + *shmemNumCoords, *shmemNumDataNodes); + /* Finally sort the lists */ if (*shmemNumCoords > 1) qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes); @@ -372,6 +384,9 @@ PgxcNodeGetOids(Oid **coOids, 
Oid **dnOids, { LWLockAcquire(NodeTableLock, LW_SHARED); + elog(DEBUG1, "Get OIDs from table: %d coordinators and %d datanodes", + *shmemNumCoords, *shmemNumDataNodes); + if (num_coords) *num_coords = *shmemNumCoords; if (num_dns) @@ -656,6 +671,13 @@ PgxcNodeAlter(AlterNodeStmt *stmt) node_name))); /* Check type dependency */ +#ifndef XCP + /* + * XCP: + * Initially node identify itself as a Coordinator and this should be + * changed for datanodes. In general, it should be safe to turn + * Coordinator to Datanode and back + */ if (node_type_old == PGXC_NODE_COORDINATOR && node_type == PGXC_NODE_DATANODE) ereport(ERROR, @@ -668,6 +690,7 @@ PgxcNodeAlter(AlterNodeStmt *stmt) (errcode(ERRCODE_SYNTAX_ERROR), errmsg("PGXC node %s: cannot alter Datanode to Coordinator", node_name))); +#endif /* Update values for catalog entry */ MemSet(new_record, 0, sizeof(new_record)); diff --git a/src/backend/pgxc/plan/Makefile b/src/backend/pgxc/plan/Makefile new file mode 100644 index 0000000000..c322c03656 --- /dev/null +++ b/src/backend/pgxc/plan/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for rewrite +# +# Portions Copyright(C) 2010-2012 Postgres-XC Development Group +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/plan +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = planner.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c new file mode 100644 index 0000000000..d294063f5c --- /dev/null +++ b/src/backend/pgxc/plan/planner.c @@ -0,0 +1,2282 @@ +/*------------------------------------------------------------------------- + * + * planner.c + * + * Functions for generating a PGXC style plan. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "miscadmin.h" +#include "access/transam.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_class.h" +#include "catalog/pg_inherits_fn.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "commands/prepare.h" +#include "executor/executor.h" +#include "lib/stringinfo.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "optimizer/clauses.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "optimizer/tlist.h" +#include "parser/parse_agg.h" +#include "parser/parse_func.h" +#include "parser/parse_relation.h" +#include "parser/parsetree.h" +#include "parser/parse_oper.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxc.h" +#include "pgxc/locator.h" +#include "pgxc/nodemgr.h" +#include "pgxc/planner.h" +#include "pgxc/postgresql_fdw.h" +#include "tcop/pquery.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/portal.h" +#include "utils/syscache.h" +#include "utils/numeric.h" +#include "utils/memutils.h" +#include "access/hash.h" +#include "commands/tablecmds.h" +#include "utils/timestamp.h" +#include "utils/date.h" + +#ifndef XCP +/* Forbid unsafe SQL statements */ +bool StrictStatementChecking = true; +/* fast query shipping is enabled by default */ +bool enable_fast_query_shipping = true; + +static RemoteQuery 
*makeRemoteQuery(void); +static void validate_part_col_updatable(const Query *query); +static bool contains_temp_tables(List *rtable); +static bool contains_only_pg_catalog(List *rtable); +static void pgxc_handle_unsupported_stmts(Query *query); +static PlannedStmt *pgxc_FQS_planner(Query *query, int cursorOptions, + ParamListInfo boundParams); +static bool pgxc_query_needs_coord(Query *query); +static ExecNodes *pgxc_is_query_shippable(Query *query, int query_level); +static void pgxc_FQS_find_datanodes(Shippability_context *sc_context); +static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *exec_nodes1, + ExecNodes *exec_nodes2, + bool merge_dist_equijoin, + bool merge_replicated_only); +static PlannedStmt *pgxc_handle_exec_direct(Query *query, int cursorOptions, + ParamListInfo boundParams); +static RemoteQuery *pgxc_FQS_create_remote_plan(Query *query, + ExecNodes *exec_nodes, + bool is_exec_direct); +static void pgxc_set_remote_parameters(PlannedStmt *plan, ParamListInfo boundParams); +static ExecNodes *pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, + Query *query); +static bool pgxc_qual_hash_dist_equijoin(Relids varnos_1, Relids varnos_2, + Oid distcol_type, Node *quals, + List *rtable); +static bool VarAttrIsPartAttr(Var *var, List *rtable); +static void pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason); + +/* + * make_ctid_col_ref + * + * creates a Var for a column referring to ctid + */ + +static Var * +make_ctid_col_ref(Query *qry) +{ + ListCell *lc1, *lc2; + RangeTblEntry *rte1, *rte2; + int tableRTEs, firstTableRTENumber; + RangeTblEntry *rte_in_query = NULL; + AttrNumber attnum; + Oid vartypeid; + int32 type_mod; + Oid varcollid; + + /* + * If the query has more than 1 table RTEs where both are different, we can not add ctid to the query target list + * We should in this case skip adding it to the target list and a WHERE CURRENT OF should then + * fail saying the query is not a simply update able scan of 
table + */ + + tableRTEs = 0; + foreach(lc1, qry->rtable) + { + rte1 = (RangeTblEntry *) lfirst(lc1); + + if (rte1->rtekind == RTE_RELATION) + { + tableRTEs++; + if (tableRTEs > 1) + { + /* + * See if we get two RTEs in case we have two references + * to the same table with different aliases + */ + foreach(lc2, qry->rtable) + { + rte2 = (RangeTblEntry *) lfirst(lc2); + + if (rte2->rtekind == RTE_RELATION) + { + if (rte2->relid != rte1->relid) + { + return NULL; + } + } + } + continue; + } + rte_in_query = rte1; + } + } + + if (tableRTEs > 1) + { + firstTableRTENumber = 0; + foreach(lc1, qry->rtable) + { + rte1 = (RangeTblEntry *) lfirst(lc1); + firstTableRTENumber++; + if (rte1->rtekind == RTE_RELATION) + { + break; + } + } + } + else + { + firstTableRTENumber = 1; + } + + attnum = specialAttNum("ctid"); + Assert(rte_in_query); + get_rte_attribute_type(rte_in_query, attnum, &vartypeid, &type_mod, &varcollid); + return makeVar(firstTableRTENumber, attnum, vartypeid, type_mod, varcollid, 0); +} + +/* + * Returns whether or not the rtable (and its subqueries) + * only contain pg_catalog entries. + */ +static bool +contains_only_pg_catalog(List *rtable) +{ + ListCell *item; + + /* May be complicated. 
Before giving up, just check for pg_catalog usage */ + foreach(item, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(item); + + if (rte->rtekind == RTE_RELATION) + { + if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE) + return false; + } + else if (rte->rtekind == RTE_SUBQUERY && + !contains_only_pg_catalog(rte->subquery->rtable)) + return false; + } + return true; +} + + +/* + * Returns true if at least one temporary table is in use + * in query (and its subqueries) + */ +static bool +contains_temp_tables(List *rtable) +{ + ListCell *item; + + foreach(item, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(item); + + if (rte->rtekind == RTE_RELATION) + { + if (IsTempTable(rte->relid)) + return true; + } + else if (rte->rtekind == RTE_SUBQUERY && + contains_temp_tables(rte->subquery->rtable)) + return true; + } + + return false; +} + +/* + * Create an instance of RemoteQuery and initialize fields + */ +static RemoteQuery * +makeRemoteQuery(void) +{ + RemoteQuery *result = makeNode(RemoteQuery); + result->combine_type = COMBINE_TYPE_NONE; + result->exec_type = EXEC_ON_DATANODES; + result->exec_direct_type = EXEC_DIRECT_NONE; + + return result; +} + +/* + * get_plan_combine_type - determine combine type + * + * COMBINE_TYPE_SAME - for replicated updates + * COMBINE_TYPE_SUM - for hash and round robin updates + * COMBINE_TYPE_NONE - for operations where row_count is not applicable + * + * return NULL if it is not safe to be done in a single step. + */ +static CombineType +get_plan_combine_type(Query *query, char baselocatortype) +{ + + switch (query->commandType) + { + case CMD_INSERT: + case CMD_UPDATE: + case CMD_DELETE: + return baselocatortype == LOCATOR_TYPE_REPLICATED ? 
+ COMBINE_TYPE_SAME : COMBINE_TYPE_SUM; + + default: + return COMBINE_TYPE_NONE; + } + /* quiet compiler warning */ + return COMBINE_TYPE_NONE; +} + +/* + * get oid of the function whose name is passed as argument + */ + +static Oid +get_fn_oid(char *fn_name, Oid *p_rettype) +{ + Value *fn_nm; + List *fn_name_list; + FuncDetailCode fdc; + bool retset; + int nvargs; + Oid *true_typeids; + Oid func_oid; + + fn_nm = makeString(fn_name); + fn_name_list = list_make1(fn_nm); + + fdc = func_get_detail(fn_name_list, + NULL, /* argument expressions */ + NULL, /* argument names */ + 0, /* argument numbers */ + NULL, /* argument types */ + false, /* expand variable number or args */ + false, /* expand defaults */ + &func_oid, /* oid of the function - returned detail*/ + p_rettype, /* function return type - returned detail */ + &retset, /* - returned detail*/ + &nvargs, /* - returned detail*/ + &true_typeids, /* - returned detail */ + NULL /* arguemnt defaults returned*/ + ); + + pfree(fn_name_list); + if (fdc == FUNCDETAIL_NORMAL) + { + return func_oid; + } + return InvalidOid; +} + +/* + * Append ctid to the field list of step queries to support update + * WHERE CURRENT OF. The ctid is not sent down to client but used as a key + * to find target tuple. + * PGXCTODO: Bug + * This function modifies the original query to add ctid + * and nodename in the targetlist. It should rather modify the targetlist of the + * query to be shipped by the RemoteQuery node. 
+ */ +static void +fetch_ctid_of(Plan *subtree, Query *query) +{ + /* recursively process subnodes */ + if (innerPlan(subtree)) + fetch_ctid_of(innerPlan(subtree), query); + if (outerPlan(subtree)) + fetch_ctid_of(outerPlan(subtree), query); + + /* we are only interested in RemoteQueries */ + if (IsA(subtree, RemoteQuery)) + { + RemoteQuery *step = (RemoteQuery *) subtree; + TargetEntry *te1; + Query *temp_qry; + FuncExpr *func_expr; + AttrNumber resno; + Oid funcid; + Oid rettype; + Var *ctid_expr; + MemoryContext oldcontext; + MemoryContext tmpcontext; + + tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "Temp Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(tmpcontext); + + /* Copy the query tree to make changes to the target list */ + temp_qry = copyObject(query); + /* Get the number of entries in the target list */ + resno = list_length(temp_qry->targetList); + + /* Make a ctid column ref expr to add in target list */ + ctid_expr = make_ctid_col_ref(temp_qry); + if (ctid_expr == NULL) + { + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); + return; + } + + te1 = makeTargetEntry((Expr *)ctid_expr, resno+1, NULL, false); + + /* add the target entry to the query target list */ + temp_qry->targetList = lappend(temp_qry->targetList, te1); + + /* PGXCTODO We can take this call in initialization rather than getting it always */ + + /* Get the Oid of the function */ + funcid = get_fn_oid("pgxc_node_str", &rettype); + if (OidIsValid(funcid)) + { + StringInfoData deparsed_qry; + TargetEntry *te2; + + /* create a function expression */ + func_expr = makeFuncExpr(funcid, rettype, NULL, InvalidOid, InvalidOid, COERCE_DONTCARE); + /* make a target entry for function call */ + te2 = makeTargetEntry((Expr *)func_expr, resno+2, NULL, false); + /* add the target entry to the query target list */ + temp_qry->targetList = lappend(temp_qry->targetList, te2); + + 
initStringInfo(&deparsed_qry); + deparse_query(temp_qry, &deparsed_qry, NIL); + + MemoryContextSwitchTo(oldcontext); + + if (step->sql_statement != NULL) + pfree(step->sql_statement); + + step->sql_statement = pstrdup(deparsed_qry.data); + + MemoryContextDelete(tmpcontext); + } + else + { + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); + } + } +} + +/* + * Build up a QueryPlan to execute on. + * + * This functions tries to find out whether + * 1. The statement can be shipped to the Datanode and Coordinator is needed + * only as a proxy - in which case, it creates a single node plan. + * 2. The statement can be evaluated on the Coordinator completely - thus no + * query shipping is involved and standard_planner() is invoked to plan the + * statement + * 3. The statement needs Coordinator as well as Datanode for evaluation - + * again we use standard_planner() to plan the statement. + * + * The plan generated in either of the above cases is returned. + */ +PlannedStmt * +pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *result; + + /* handle the un-supported statements, obvious errors etc. 
*/ + pgxc_handle_unsupported_stmts(query); + + result = pgxc_handle_exec_direct(query, cursorOptions, boundParams); + if (result) + return result; + + /* see if can ship the query completely */ + result = pgxc_FQS_planner(query, cursorOptions, boundParams); + if (result) + return result; + + /* we need Coordinator for evaluation, invoke standard planner */ + result = standard_planner(query, cursorOptions, boundParams); + pgxc_set_remote_parameters(result, boundParams); + return result; +} + +static PlannedStmt * +pgxc_handle_exec_direct(Query *query, int cursorOptions, + ParamListInfo boundParams) +{ + PlannedStmt *result = NULL; + PlannerGlobal *glob; + PlannerInfo *root; + /* + * if the query has its utility set, it could be an EXEC_DIRECT statement, + * check if it needs to be executed on Coordinator + */ + if (query->utilityStmt && + IsA(query->utilityStmt, RemoteQuery)) + { + RemoteQuery *node = (RemoteQuery *)query->utilityStmt; + /* EXECUTE DIRECT statements on remote nodes don't need Coordinator */ + if (node->exec_direct_type != EXEC_DIRECT_NONE && + node->exec_direct_type != EXEC_DIRECT_LOCAL && + node->exec_direct_type != EXEC_DIRECT_LOCAL_UTILITY) + { + glob = makeNode(PlannerGlobal); + glob->boundParams = boundParams; + /* Create a PlannerInfo data structure, usually it is done for a subquery */ + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->planner_cxt = CurrentMemoryContext; + /* build the PlannedStmt result */ + result = makeNode(PlannedStmt); + /* Try and set what we can, rest must have been zeroed out by makeNode() */ + result->commandType = query->commandType; + result->canSetTag = query->canSetTag; + /* Set result relations */ + if (query->commandType != CMD_SELECT) + result->resultRelations = list_make1_int(query->resultRelation); + + result->planTree = (Plan *)pgxc_FQS_create_remote_plan(query, NULL, true); + result->rtable = query->rtable; + /* + * We need to save plan 
dependencies, so that dropping objects will + * invalidate the cached plan if it depends on those objects. Table + * dependencies are available in glob->relationOids and all other + * dependencies are in glob->invalItems. These fields can be retrieved + * through set_plan_references(). + */ + result->planTree = set_plan_references(root, result->planTree); + result->relationOids = glob->relationOids; + result->invalItems = glob->invalItems; + } + } + + /* Set existing remote parameters */ + pgxc_set_remote_parameters(result, boundParams); + + return result; +} +/* + * pgxc_handle_unsupported_stmts + * Throw error for the statements that can not be handled in XC + */ +static void +pgxc_handle_unsupported_stmts(Query *query) +{ + /* + * PGXCTODO: This validation will not be removed + * until we support moving tuples from one node to another + * when the partition column of a table is updated + */ + if (query->commandType == CMD_UPDATE) + validate_part_col_updatable(query); + + if (query->returningList) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("RETURNING clause not yet supported")))); +} + +/* + * pgxc_FQS_planner + * The routine tries to see if the statement can be completely evaluated on the + * Datanodes. In such cases Coordinator is not needed to evaluate the statement, + * and just acts as a proxy. A statement can be completely shipped to the remote + * node if every row of the result can be evaluated on a single Datanode. + * For example: + * + * 1. SELECT * FROM tab1; where tab1 is a distributed table - Every row of the + * result set can be evaluated at a single Datanode. Hence this statement is + * completely shippable even though many Datanodes are involved in evaluating + * complete result set. In such case Coordinator will be able to gather rows + * arisign from individual Datanodes and proxy the result to the client. + * + * 2. 
SELECT count(*) FROM tab1; where tab1 is a distributed table - there is + * only one row in the result but it needs input from all the Datanodes. Hence + * this is not completely shippable. + * + * 3. SELECT count(*) FROM tab1; where tab1 is replicated table - since result + * can be obtained from a single Datanode, this is a completely shippable + * statement. + * + * fqs in the name of function is acronym for fast query shipping. + */ +static PlannedStmt * +pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *result; + PlannerGlobal *glob; + PlannerInfo *root; + ExecNodes *exec_nodes; + Plan *top_plan; + + /* Try by-passing standard planner, if fast query shipping is enabled */ + if (!enable_fast_query_shipping) + return NULL; + + /* Cursor options may come from caller or from DECLARE CURSOR stmt */ + if (query->utilityStmt && + IsA(query->utilityStmt, DeclareCursorStmt)) + cursorOptions |= ((DeclareCursorStmt *) query->utilityStmt)->options; + /* + * If the query can not be or need not be shipped to the Datanodes, don't + * create any plan here. standard_planner() will take care of it. + */ + exec_nodes = pgxc_is_query_shippable(query, 0); + if (exec_nodes == NULL) + return NULL; + + glob = makeNode(PlannerGlobal); + glob->boundParams = boundParams; + /* Create a PlannerInfo data structure, usually it is done for a subquery */ + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->planner_cxt = CurrentMemoryContext; + + /* + * We decided to ship the query to the Datanode/s, create a RemoteQuery node + * for the same. + */ + top_plan = (Plan *)pgxc_FQS_create_remote_plan(query, exec_nodes, false); + /* + * If creating a plan for a scrollable cursor, make sure it can run + * backwards on demand. Add a Material node at the top at need. 
+ */ + if (cursorOptions & CURSOR_OPT_SCROLL) + { + if (!ExecSupportsBackwardScan(top_plan)) + top_plan = materialize_finished_plan(top_plan); + } + + /* + * Just before creating the PlannedStmt, do some final cleanup + * We need to save plan dependencies, so that dropping objects will + * invalidate the cached plan if it depends on those objects. Table + * dependencies are available in glob->relationOids and all other + * dependencies are in glob->invalItems. These fields can be retrieved + * through set_plan_references(). + */ + top_plan = set_plan_references(root, top_plan); + + /* build the PlannedStmt result */ + result = makeNode(PlannedStmt); + /* Try and set what we can, rest must have been zeroed out by makeNode() */ + result->commandType = query->commandType; + result->canSetTag = query->canSetTag; + result->utilityStmt = query->utilityStmt; + + /* Set result relations */ + if (query->commandType != CMD_SELECT) + result->resultRelations = list_make1_int(query->resultRelation); + result->planTree = top_plan; + result->rtable = query->rtable; + result->relationOids = glob->relationOids; + result->invalItems = glob->invalItems; + + /* + * If query is DECLARE CURSOR fetch CTIDs and node names from the remote node + * Use CTID as a key to update/delete tuples on remote nodes when handling + * WHERE CURRENT OF. 
+ */ + if (query->utilityStmt && IsA(query->utilityStmt, DeclareCursorStmt)) + fetch_ctid_of(result->planTree, query); + + /* Set existing remote parameters */ + pgxc_set_remote_parameters(result, boundParams); + + return result; +} + +static RemoteQuery * +pgxc_FQS_create_remote_plan(Query *query, ExecNodes *exec_nodes, bool is_exec_direct) +{ + RemoteQuery *query_step; + StringInfoData buf; + RangeTblEntry *dummy_rte; + + /* EXECUTE DIRECT statements have their RemoteQuery node already built when analyzing */ + if (is_exec_direct) + { + Assert(IsA(query->utilityStmt, RemoteQuery)); + query_step = (RemoteQuery *)query->utilityStmt; + query->utilityStmt = NULL; + } + else + { + query_step = makeRemoteQuery(); + query_step->exec_nodes = exec_nodes; + } + + Assert(query_step->exec_nodes); + + /* Datanodes should finalise the results of this query */ + query->qry_finalise_aggs = true; + + /* Deparse query tree to get step query. */ + if ( query_step->sql_statement == NULL ) + { + initStringInfo(&buf); + deparse_query(query, &buf, NIL); + query_step->sql_statement = pstrdup(buf.data); + pfree(buf.data); + } + /* + * PGXCTODO: we may route this same Query structure through + * standard_planner, where we don't want Datanodes to finalise the results. + * Turn it off. At some point, we will avoid routing the same query + * structure through the standard_planner by modifying it only when it's not + * be routed through standard_planner. + */ + query->qry_finalise_aggs = false; + /* Optimize multi-node handling */ + query_step->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate); + query_step->has_row_marks = query->hasForUpdate; + + /* Check if temporary tables are in use in query */ + /* PGXC_FQS_TODO: scanning the rtable again for the queries should not be + * needed. We should be able to find out if the query has a temporary object + * while finding nodes for the objects. But there is no way we can convey + * that information here. 
Till such a connection is available, this is it. + */ + if (contains_temp_tables(query->rtable)) + query_step->is_temp = true; + + /* + * We need to evaluate some expressions like the ExecNodes->en_expr at + * Coordinator, prepare those for evaluation. Ideally we should call + * preprocess_expression, but it needs PlannerInfo structure for the same + */ + fix_opfuncids((Node *)(query_step->exec_nodes->en_expr)); + /* + * PGXCTODO + * When Postgres runs insert into t (a) values (1); against table + * defined as create table t (a int, b int); the plan is looking + * like insert into t (a,b) values (1,null); + * Later executor is verifying plan, to make sure table has not + * been altered since plan has been created and comparing table + * definition with plan target list and output error if they do + * not match. + * I could not find better way to generate targetList for pgxc plan + * then call standard planner and take targetList from the plan + * generated by Postgres. + */ + query_step->combine_type = get_plan_combine_type( + query, query_step->exec_nodes->baselocatortype); + + /* + * Create a dummy RTE for the remote query being created. Append the dummy + * range table entry to the range table. Note that this modifies the master + * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to + * find the rte the Vars built below refer to. Also create the tuple + * descriptor for the result of this query from the base_tlist (targetlist + * we used to generate the remote node query). + */ + dummy_rte = makeNode(RangeTblEntry); + dummy_rte->rtekind = RTE_REMOTE_DUMMY; + /* Use a dummy relname... 
*/ + if (is_exec_direct) + dummy_rte->relname = "__EXECUTE_DIRECT__"; + else + dummy_rte->relname = "__REMOTE_FQS_QUERY__"; + dummy_rte->eref = makeAlias("__REMOTE_FQS_QUERY__", NIL); + /* Rest will be zeroed out in makeNode() */ + + query->rtable = lappend(query->rtable, dummy_rte); + query_step->scan.scanrelid = list_length(query->rtable); + query_step->scan.plan.targetlist = query->targetList; + query_step->base_tlist = query->targetList; + + return query_step; +} + +/* + * pgxc_query_needs_coord + * Check if the query needs Coordinator for evaluation or it can be completely + * evaluated on Coordinator. Return true if so, otherwise return false. + */ +static bool +pgxc_query_needs_coord(Query *query) +{ + /* + * If the query is an EXEC DIRECT on the same Coordinator where it's fired, + * it should not be shipped + */ + if (query->is_local) + return true; + /* + * If the query involves just the catalog tables, and is not an EXEC DIRECT + * statement, it can be evaluated completely on the Coordinator. No need to + * involve Datanodes. + */ + if (contains_only_pg_catalog(query->rtable)) + return true; + + + /* Allow for override */ + if (query->commandType != CMD_SELECT && + query->commandType != CMD_INSERT && + query->commandType != CMD_UPDATE && + query->commandType != CMD_DELETE) + { + if (StrictStatementChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("This command is not yet supported.")))); + + return true; + } + + return false; +} + +/* + * Set the given reason in Shippability_context indicating why the query can not be + * shipped directly to the Datanodes. + */ +static void +pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason) +{ + context->sc_shippability = bms_add_member(context->sc_shippability, reason); +} + +/* + * See if a given reason is why the query can not be shipped directly + * to the Datanodes. 
+ */ +bool +pgxc_test_shippability_reason(Shippability_context *context, ShippabilityStat reason) +{ + return bms_is_member(reason, context->sc_shippability); +} + +/* + * pgxc_is_query_shippable + * This function calls the query walker to analyse the query to gather + * information like Constraints under which the query can be shippable, nodes + * on which the query is going to be executed etc. + * Based on the information gathered, it decides whether the query can be + * executed on Datanodes directly without involving Coordinator. + * If the query is shippable this routine also returns the nodes where the query + * should be shipped. If the query is not shippable, it returns NULL. + */ +static ExecNodes * +pgxc_is_query_shippable(Query *query, int query_level) +{ + Shippability_context sc_context; + ExecNodes *exec_nodes; + bool canShip = true; + Bitmapset *shippability; + + memset(&sc_context, 0, sizeof(sc_context)); + /* let's assume that by default query is shippable */ + sc_context.sc_query = query; + sc_context.sc_query_level = query_level; + sc_context.sc_for_expr = false; + + /* + * We might have already decided not to ship the query to the Datanodes, but + * still walk it anyway to find out if there are any subqueries which can be + * shipped. + */ + pgxc_shippability_walker((Node *)query, &sc_context); + /* + * We have merged the nodelists and distributions of all subqueries seen in + * the query tree, merge it with the same obtained for the relations + * involved in the query. + * PGXC_FQS_TODO: + * Merge the subquery ExecNodes if both of them are replicated. + * The logic to merge node lists with other distribution + * strategy is not clear yet. 
+ */ + exec_nodes = sc_context.sc_exec_nodes; + if (exec_nodes) + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, + sc_context.sc_subquery_en, false, + true); + + /* + * Look at the information gathered by the walker in Shippability_context and that + * in the Query structure to decide whether we should ship this query + * directly to the Datanode or not + */ + + /* + * If the planner was not able to find the Datanodes to the execute the + * query, the query is not completely shippable. So, return NULL + */ + if (!exec_nodes) + return NULL; + + /* Copy the shippability reasons. We modify the copy for easier handling. + * The original can be saved away */ + shippability = bms_copy(sc_context.sc_shippability); + + /* + * If the query has an expression which renders the shippability to single + * node, and query needs to be shipped to more than one node, it can not be + * shipped + */ + if (bms_is_member(SS_NEED_SINGLENODE, shippability)) + { + /* We handled the reason here, reset it */ + shippability = bms_del_member(shippability, SS_NEED_SINGLENODE); + /* if nodeList has no nodes, it ExecNodes will have other means to know + * the nodes where to execute like distribution column expression. We + * can't tell how many nodes the query will be executed on, hence treat + * that as multiple nodes. 
+ */
+ if (list_length(exec_nodes->nodeList) != 1)
+ canShip = false;
+ }
+ /* We have dealt with aggregates as well, delete the Has aggregates status */
+ shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR);
+
+ /* Can not ship the query for some reason */
+ if (!bms_is_empty(shippability))
+ canShip = false;
+
+ /* Always keep this at the end before checking canShip and return */
+ if (!canShip && exec_nodes)
+ FreeExecNodes(&exec_nodes);
+ /* If query is to be shipped, we should know where to execute the query */
+ Assert (!canShip || exec_nodes);
+
+ bms_free(shippability);
+ shippability = NULL;
+
+ return exec_nodes;
+}
+
+/*
+ * pgxc_merge_exec_nodes
+ * The routine combines the two exec_nodes passed such that the resultant
+ * exec_node corresponds to the JOIN of respective relations.
+ * If both exec_nodes can not be merged, it returns NULL.
+ */
+static ExecNodes *
+pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2, bool merge_dist_equijoin,
+ bool merge_replicated_only)
+{
+ ExecNodes *merged_en = makeNode(ExecNodes);
+ ExecNodes *tmp_en;
+
+ /* If either of exec_nodes are NULL, return the copy of other one */
+ if (!en1)
+ {
+ tmp_en = copyObject(en2);
+ return tmp_en;
+ }
+ if (!en2)
+ {
+ tmp_en = copyObject(en1);
+ return tmp_en;
+ }
+
+ /* Following cases are not handled in this routine */
+ /* PGXC_FQS_TODO how should we handle table usage type? */
+ if (en1->primarynodelist || en2->primarynodelist ||
+ en1->en_expr || en2->en_expr ||
+ OidIsValid(en1->en_relid) || OidIsValid(en2->en_relid) ||
+ en1->accesstype != RELATION_ACCESS_READ || en2->accesstype != RELATION_ACCESS_READ)
+ return NULL;
+
+ if (IsLocatorReplicated(en1->baselocatortype) &&
+ IsLocatorReplicated(en2->baselocatortype))
+ {
+ /*
+ * Replicated/replicated join case
+ * Check that replicated relation is not disjoint
+ * with initial relation which is also replicated.
+ * If there is a common portion of the node list between + * the two relations, other rtables have to be checked on + * this restricted list. + */ + merged_en->nodeList = list_intersection_int(en1->nodeList, + en2->nodeList); + merged_en->baselocatortype = LOCATOR_TYPE_REPLICATED; + /* No intersection, so has to go though standard planner... */ + if (!merged_en->nodeList) + FreeExecNodes(&merged_en); + return merged_en; + } + + /* + * We are told to merge the nodelists if both the distributions are + * replicated. We checked that above, so bail out + */ + if (merge_replicated_only) + { + FreeExecNodes(&merged_en); + return merged_en; + } + + if (IsLocatorReplicated(en1->baselocatortype) && + IsLocatorColumnDistributed(en2->baselocatortype)) + { + List *diff_nodelist = NULL; + /* + * Replicated/distributed join case. + * Node list of distributed table has to be included + * in node list of replicated table. + */ + diff_nodelist = list_difference_int(en2->nodeList, en1->nodeList); + /* + * If the difference list is not empty, this means that node list of + * distributed table is not completely mapped by node list of replicated + * table, so go through standard planner. + */ + if (diff_nodelist) + FreeExecNodes(&merged_en); + else + { + merged_en->nodeList = list_copy(en2->nodeList); + merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED; + } + return merged_en; + } + + if (IsLocatorColumnDistributed(en1->baselocatortype) && + IsLocatorReplicated(en2->baselocatortype)) + { + List *diff_nodelist = NULL; + /* + * Distributed/replicated join case. + * Node list of distributed table has to be included + * in node list of replicated table. + */ + diff_nodelist = list_difference_int(en1->nodeList, en2->nodeList); + + /* + * If the difference list is not empty, this means that node list of + * distributed table is not completely mapped by node list of replicated + * table, so go through standard planner. 
+ */ + if (diff_nodelist) + FreeExecNodes(&merged_en); + else + { + merged_en->nodeList = list_copy(en1->nodeList); + merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED; + } + return merged_en; + } + + if (IsLocatorColumnDistributed(en1->baselocatortype) && + IsLocatorColumnDistributed(en2->baselocatortype)) + { + /* + * Distributed/distributed case + * If the caller has suggested that this is an equi-join between two + * distributed results, check if both are distributed by the same + * distribution strategy, and have the same nodes in the distribution + * node list. The caller should have made sure that distribution column + * type is same. + */ + if (merge_dist_equijoin && + en1->baselocatortype == en2->baselocatortype && + !list_difference_int(en1->nodeList, en2->nodeList) && + !list_difference_int(en2->nodeList, en1->nodeList)) + { + merged_en->nodeList = list_copy(en1->nodeList); + merged_en->baselocatortype = en1->baselocatortype; + } + else if (list_length(en1->nodeList) == 1 && list_length(en2->nodeList) == 1) + { + merged_en->nodeList = list_intersection_int(en1->nodeList, + en2->nodeList); + merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED; + } + else + FreeExecNodes(&merged_en); + return merged_en; + } + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support this distribution type yet"), +#else + errmsg("Postgres-XC does not support this distribution type yet"), +#endif + errdetail("The feature is not currently supported"))); + + /* Keep compiler happy */ + return NULL; +} + +static void +pgxc_FQS_find_datanodes(Shippability_context *sc_context) +{ + Query *query = sc_context->sc_query; + ListCell *rt; + ExecNodes *exec_nodes = NULL; + bool canShip = true; + Index varno = 0; + + /* No query, no nodes to execute! */ + if (!query) + { + sc_context->sc_exec_nodes = NULL; + return; + } + + /* + * For every range table entry, + * 1. Find out the Datanodes needed for that range table + * 2. 
Merge these Datanodes with the already available Datanodes + * 3. If the merge is unsuccessful, we can not ship this query directly to + * the Datanode/s + */ + foreach(rt, query->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt); + Oid distcol_type; /* TODO mostly this is not needed */ + Relids dist_varnos; + + varno++; + switch (rte->rtekind) + { + case RTE_RELATION: + { + ExecNodes *rel_exec_nodes; + ExecNodes *tmp_en; + bool merge_dist_equijoin = false; + /* + * In case of inheritance, child tables can have completely different + * Datanode distribution than parent. To handle inheritance we need + * to merge the Datanodes of the children table as well. The inheritance + * is resolved during planning(?), so we may not have the RTEs of the + * children here. Also, the exact method of merging Datanodes of the + * children is not known yet. So, when inheritance is requested, query + * can not be shipped. + */ + if (rte->inh) + { + /* + * See prologue of has_subclass, we might miss on the + * optimization because has_subclass can return true + * even if there aren't any subclasses, but it's ok + */ + if (has_subclass(rte->relid)) + { + canShip = false; + break; + } + } + + if (rte->relkind != RELKIND_RELATION) + { + canShip = false; + break; + } + rel_exec_nodes = pgxc_FQS_get_relation_nodes(rte,varno, query); + if (!rel_exec_nodes) + { + /* + * No information about the location of relation in XC, + * a local table OR system catalog. The query can not be + * pushed. 
+ */ + canShip = false; + break; + } + if (varno == 1) + { + if (IsLocatorColumnDistributed(rel_exec_nodes->baselocatortype)) + { + RelationLocInfo *rel_loc_info = GetRelationLocInfo(rte->relid); + distcol_type = get_atttype(rte->relid, + rel_loc_info->partAttrNum); + dist_varnos = bms_make_singleton(varno); + } + else + { + distcol_type = InvalidOid; + dist_varnos = NULL; + } + } + if (exec_nodes && + IsLocatorDistributedByValue(exec_nodes->baselocatortype) && + OidIsValid(distcol_type) && bms_num_members(dist_varnos) > 0 && + exec_nodes->baselocatortype == rel_exec_nodes->baselocatortype) + { + /* + * If the already reduced JOINs is distributed the same way + * as the current relation, check if there exists an + * equi-join condition between the relations and the data type + * of distribution column involved is same for both the + * relations + */ + if (pgxc_qual_hash_dist_equijoin(dist_varnos, + bms_make_singleton(varno), + distcol_type, + query->jointree->quals, + query->rtable)) + merge_dist_equijoin = true; + } + + /* Save the current exec_nodes to be freed later */ + tmp_en = exec_nodes; + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, rel_exec_nodes, + merge_dist_equijoin, + false); + /* + * The JOIN is equijoin between distributed tables, and we could + * obtain the nodelist for pushing this JOIN, so add the current + * relation to the list of relations already JOINed in the same + * fashion. + */ + if (exec_nodes && merge_dist_equijoin) + dist_varnos = bms_add_member(dist_varnos, varno); + FreeExecNodes(&tmp_en); + } + break; + + case RTE_JOIN: + /* Is information here useful in some or other way? 
*/ + break; + case RTE_CTE: + case RTE_SUBQUERY: + case RTE_FUNCTION: + case RTE_VALUES: + default: + canShip = false; + } + + if (!canShip || !exec_nodes) + break; + } + + /* + * If we didn't find the Datanodes to ship the query to, we shouldn't ship + * the query :) + */ + if (!exec_nodes || !(exec_nodes->nodeList || exec_nodes->en_expr)) + canShip = false; + + if (canShip) + { + /* + * If relations involved in the query are such that ultimate JOIN is + * replicated JOIN, choose only one of them. If one of them is a + * preferred node choose that one, otherwise choose the first one. + */ + if (IsLocatorReplicated(exec_nodes->baselocatortype) && + exec_nodes->accesstype == RELATION_ACCESS_READ) + { + List *tmp_list = exec_nodes->nodeList; + ListCell *item; + int nodeid = -1; + foreach(item, exec_nodes->nodeList) + { + int cnt_nodes; + for (cnt_nodes = 0; + cnt_nodes < num_preferred_data_nodes && nodeid < 0; + cnt_nodes++) + { + if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes], + PGXC_NODE_DATANODE) == lfirst_int(item)) + nodeid = lfirst_int(item); + } + if (nodeid >= 0) + break; + } + if (nodeid < 0) + exec_nodes->nodeList = list_make1_int(linitial_int(exec_nodes->nodeList)); + else + exec_nodes->nodeList = list_make1_int(nodeid); + list_free(tmp_list); + } + sc_context->sc_exec_nodes = exec_nodes; + } + else if (exec_nodes) + { + FreeExecNodes(&exec_nodes); + } + return; +} + +static bool +pgxc_qual_hash_dist_equijoin(Relids varnos_1, Relids varnos_2, Oid distcol_type, + Node *quals, List *rtable) +{ + List *lquals; + ListCell *qcell; + + /* + * Make a copy of the argument bitmaps, it will be modified by + * bms_first_member(). 
+ */
+ varnos_1 = bms_copy(varnos_1);
+ varnos_2 = bms_copy(varnos_2);
+
+ lquals = make_ands_implicit((Expr *)quals);
+ foreach(qcell, lquals)
+ {
+ Expr *qual_expr = (Expr *)lfirst(qcell);
+ OpExpr *op;
+ Var *lvar;
+ Var *rvar;
+
+ if (!IsA(qual_expr, OpExpr))
+ continue;
+ op = (OpExpr *)qual_expr;
+ /* If not a binary operator, it can not be '='. */
+ if (list_length(op->args) != 2)
+ continue;
+
+ /*
+ * Check if both operands are Vars, if not check next expression */
+ if (IsA(linitial(op->args), Var) && IsA(lsecond(op->args), Var))
+ {
+ lvar = (Var *)linitial(op->args);
+ rvar = (Var *)lsecond(op->args);
+ }
+ else
+ continue;
+
+ /*
+ * If the data types of both the columns are not same, continue. Hash
+ * and Modulo of the same bytes will be same if the data types are
+ * same. So, only when the data types of the columns are same, we can
+ * ship a distributed JOIN to the Datanodes
+ */
+ if (exprType((Node *)lvar) != exprType((Node *)rvar))
+ continue;
+
+ /* if the vars do not correspond to the required varnos, continue. */
+ if ((bms_is_member(lvar->varno, varnos_1) && bms_is_member(rvar->varno, varnos_2)) ||
+ (bms_is_member(lvar->varno, varnos_2) && bms_is_member(rvar->varno, varnos_1)))
+ {
+ if (!VarAttrIsPartAttr(lvar, rtable) ||
+ !VarAttrIsPartAttr(rvar, rtable))
+ continue;
+ }
+ else
+ continue;
+ /*
+ * If the operator is not an equality operator, check next
+ * constraint. An operator is taken as an equality operator if it's
+ * mergejoinable or hashjoinable. Beware that not every equality
+ * operator is mergejoinable or hashjoinable, so we might leave some
+ * opportunity. But then we have to rely on the opname which may not
+ * be something we know to be equality operator as well.
+ */ + if (!op_mergejoinable(op->opno, exprType((Node *)lvar)) && + !op_hashjoinable(op->opno, exprType((Node *)lvar))) + continue; + /* Found equi-join condition on distribution columns */ + return true; + } + return false; +} + +static bool VarAttrIsPartAttr(Var *var, List *rtable) +{ + RangeTblEntry *rte = rt_fetch(var->varno, rtable); + RelationLocInfo *rel_loc_info; + /* distribution column only applies to the relations */ + if (rte->rtekind != RTE_RELATION || + rte->relkind != RELKIND_RELATION) + return false; + rel_loc_info = GetRelationLocInfo(rte->relid); + if (!rel_loc_info) + return false; + if (var->varattno == rel_loc_info->partAttrNum) + return true; + return false; +} +/* + * pgxc_FQS_get_relation_nodes + * For FQS return ExecNodes structure so as to decide which Datanodes the query + * should execute on. If it is possible to set the node list directly, set it. + * Otherwise set the appropriate distribution column expression or relid in + * ExecNodes structure. + */ +static ExecNodes * +pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query) +{ + CmdType command_type = query->commandType; + bool for_update = query->rowMarks ? 
true : false; + ExecNodes *rel_exec_nodes; + RelationAccessType rel_access = RELATION_ACCESS_READ; + RelationLocInfo *rel_loc_info; + + Assert(rte == rt_fetch(varno, (query->rtable))); + + switch (command_type) + { + case CMD_SELECT: + if (for_update) + rel_access = RELATION_ACCESS_READ_FOR_UPDATE; + else + rel_access = RELATION_ACCESS_READ; + break; + + case CMD_UPDATE: + case CMD_DELETE: + rel_access = RELATION_ACCESS_UPDATE; + break; + + case CMD_INSERT: + rel_access = RELATION_ACCESS_INSERT; + break; + + default: + /* should not happen, but */ + elog(ERROR, "Unrecognised command type %d", command_type); + break; + } + + + rel_loc_info = GetRelationLocInfo(rte->relid); + /* If we don't know about the distribution of relation, bail out */ + if (!rel_loc_info) + return NULL; + + /* + * Find out the datanodes to execute this query on. + * PGXC_FQS_TODO: for now, we apply node reduction only when there is only + * one relation involved in the query. If there are multiple distributed + * tables in the query and we apply node reduction here, we may fail to ship + * the entire join. We should apply node reduction transitively. + */ + if (list_length(query->rtable) == 1) + rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno, + query->jointree->quals, rel_access); + else + rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0, + true, InvalidOid, rel_access); + + if (!rel_exec_nodes) + return NULL; + rel_exec_nodes->accesstype = rel_access; + /* + * If we are reading a replicated table, pick all the nodes where it + * resides. If the query has JOIN, it helps picking up a matching set of + * Datanodes for that JOIN. FQS planner will ultimately pick up one node if + * the JOIN is replicated. 
+ */ + if (rel_access == RELATION_ACCESS_READ && + IsLocatorReplicated(rel_loc_info->locatorType)) + { + list_free(rel_exec_nodes->nodeList); + rel_exec_nodes->nodeList = list_copy(rel_loc_info->nodeList); + } + else if (rel_access == RELATION_ACCESS_INSERT && + IsLocatorDistributedByValue(rel_loc_info->locatorType)) + { + ListCell *lc; + TargetEntry *tle; + /* + * If the INSERT is happening on a table distributed by value of a + * column, find out the + * expression for distribution column in the targetlist, and stick in + * in ExecNodes, and clear the nodelist. Execution will find + * out where to insert the row. + */ + /* It is a partitioned table, get value by looking in targetList */ + foreach(lc, query->targetList) + { + tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0) + break; + } + /* Not found, bail out */ + if (!lc) + return NULL; + + Assert(tle); + /* We found the TargetEntry for the partition column */ + list_free(rel_exec_nodes->primarynodelist); + rel_exec_nodes->primarynodelist = NULL; + list_free(rel_exec_nodes->nodeList); + rel_exec_nodes->nodeList = NULL; + rel_exec_nodes->en_expr = tle->expr; + rel_exec_nodes->en_relid = rel_loc_info->relid; + } + return rel_exec_nodes; +} +/* + * pgxc_shippability_walker + * walks the query/expression tree routed at the node passed in, gathering + * information which will help decide whether the query to which this node + * belongs is shippable to the Datanodes. + * + * The function should try to walk the entire tree analysing each subquery for + * shippability. If a subquery is shippable but not the whole query, we would be + * able to create a RemoteQuery node for that subquery, shipping it to the + * Datanode. + * + * Return value of this function is governed by the same rules as + * expression_tree_walker(), see prologue of that function for details. 
+ */ +bool +pgxc_shippability_walker(Node *node, Shippability_context *sc_context) +{ + if (node == NULL) + return false; + + /* Below is the list of nodes that can appear in a query, examine each + * kind of node and find out under what conditions query with this node can + * be shippable. For each node, update the context (add fields if + * necessary) so that decision whether to FQS the query or not can be made. + */ + switch(nodeTag(node)) + { + /* Constants are always shippable */ + case T_Const: + break; + + /* + * For placeholder nodes the shippability of the node, depends upon the + * expression which they refer to. It will be checked separately, when + * that expression is encountered. + */ + case T_CaseTestExpr: + break; + + /* + * record_in() function throws error, thus requesting a result in the + * form of anonymous record from datanode gets into error. Hence, if the + * top expression of a target entry is ROW(), it's not shippable. + */ + case T_TargetEntry: + { + TargetEntry *tle = (TargetEntry *)node; + if (tle->expr) + { + char typtype = get_typtype(exprType((Node *)tle->expr)); + if (!typtype || typtype == TYPTYPE_PSEUDO) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + } + break; + + case T_SortGroupClause: + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + break; + + /* + * Nodes, which are shippable if the tree rooted under these nodes is + * shippable + */ + case T_List: + case T_CoerceToDomainValue: + /* + * PGXCTODO: mostly, CoerceToDomainValue node appears in DDLs, + * do we handle DDLs here? + */ + case T_FieldSelect: + case T_RangeTblRef: + case T_NamedArgExpr: + case T_BoolExpr: + /* + * PGXCTODO: we might need to take into account the kind of boolean + * operator we have in the quals and see if the corresponding + * function is immutable. 
+ */ + case T_RelabelType: + case T_CoerceViaIO: + case T_ArrayCoerceExpr: + case T_ConvertRowtypeExpr: + case T_CaseExpr: + case T_ArrayExpr: + case T_RowExpr: + case T_CollateExpr: + case T_CoalesceExpr: + case T_XmlExpr: + case T_NullTest: + case T_BooleanTest: + case T_CoerceToDomain: + break; + + case T_ArrayRef: + /* + * When multiple values of of an array are updated at once + * FQS planner cannot yet handle SQL representation correctly. + * So disable FQS in this case and let standard planner manage it. + */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + break; + + case T_FieldStore: + /* + * PostgreSQL deparsing logic does not handle the FieldStore + * for more than one fields (see processIndirection()). So, let's + * handle it through standard planner, where whole row will be + * constructed. + */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + break; + + case T_SetToDefault: + /* + * PGXCTODO: we should actually check whether the default value to + * be substituted is shippable to the Datanode. Some cases like + * nextval() of a sequence can not be shipped to the Datanode, hence + * for now default values can not be shipped to the Datanodes + */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + break; + + case T_Var: + { + Var *var = (Var *)node; + /* + * if a subquery references an upper level variable, that query is + * not shippable, if shipped alone. + */ + if (var->varlevelsup > sc_context->sc_max_varlevelsup) + sc_context->sc_max_varlevelsup = var->varlevelsup; + } + break; + + case T_Param: + { + Param *param = (Param *)node; + /* PGXCTODO: Can we handle internally generated parameters? */ + if (param->paramkind != PARAM_EXTERN) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + } + break; + + case T_CurrentOfExpr: + { + /* + * Ideally we should not see CurrentOf expression here, it + * should have been replaced by the CTID = ? expression. 
But + * still, no harm in shipping it as is. + */ + } + break; + + case T_Aggref: + { + Aggref *aggref = (Aggref *)node; + /* + * An aggregate is completely shippable to the Datanode, if the + * whole group resides on that Datanode. This will be clear when + * we see the GROUP BY clause. + * agglevelsup is minimum of variable's varlevelsup, so we will + * set the sc_max_varlevelsup when we reach the appropriate + * VARs in the tree. + */ + pgxc_set_shippability_reason(sc_context, SS_HAS_AGG_EXPR); + /* + * If a stand-alone expression to be shipped, is an + * 1. aggregate with ORDER BY, DISTINCT directives, it needs all + * the qualifying rows + * 2. aggregate without collection function + * 3. (PGXCTODO:)aggregate with polymorphic transition type, the + * the transition type needs to be resolved to correctly interpret + * the transition results from Datanodes. + * Hence, such an expression can not be shipped to the datanodes. + */ + if (aggref->aggorder || + aggref->aggdistinct || + aggref->agglevelsup || + !aggref->agghas_collectfn || + IsPolymorphicType(aggref->aggtrantype)) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + } + break; + + case T_FuncExpr: + { + FuncExpr *funcexpr = (FuncExpr *)node; + /* + * PGXC_FQS_TODO: it's too restrictive not to ship non-immutable + * functions to the Datanode. We need a better way to see what + * can be shipped to the Datanode and what can not be. + */ + if (!is_immutable_func(funcexpr->funcid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_OpExpr: + case T_DistinctExpr: /* struct-equivalent to OpExpr */ + case T_NullIfExpr: /* struct-equivalent to OpExpr */ + { + /* + * All of these three are structurally equivalent to OpExpr, so + * cast the node to OpExpr and check if the operator function is + * immutable. See PGXC_FQS_TODO item for FuncExpr. + */ + OpExpr *op_expr = (OpExpr *)node; + Oid opfuncid = OidIsValid(op_expr->opfuncid) ? 
+ op_expr->opfuncid : get_opcode(op_expr->opno); + if (!OidIsValid(opfuncid) || !is_immutable_func(opfuncid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_ScalarArrayOpExpr: + { + /* + * Check if the operator function is shippable to the Datanode + * PGXC_FQS_TODO: see immutability note for FuncExpr above + */ + ScalarArrayOpExpr *sao_expr = (ScalarArrayOpExpr *)node; + Oid opfuncid = OidIsValid(sao_expr->opfuncid) ? + sao_expr->opfuncid : get_opcode(sao_expr->opno); + if (!OidIsValid(opfuncid) || !is_immutable_func(opfuncid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_RowCompareExpr: + case T_MinMaxExpr: + { + /* + * PGXCTODO should we be checking the comparision operator + * functions as well, as we did for OpExpr OR that check is + * unnecessary. Operator functions are always shippable? + * Otherwise this node should be treated similar to other + * "shell" nodes. + */ + } + break; + + case T_Query: + { + Query *query = (Query *)node; + + /* A stand-alone expression containing Query is not shippable */ + if (sc_context->sc_for_expr) + { + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + break; + } + /* We are checking shippability of whole query, go ahead */ + + /* CREATE TABLE AS is not supported in FQS */ + if (query->commandType == CMD_UTILITY && + IsA(query->utilityStmt, CreateTableAsStmt)) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + if (query->hasRecursive) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + /* + * If the query needs Coordinator for evaluation or the query can be + * completed on Coordinator itself, we don't ship it to the Datanode + */ + if (pgxc_query_needs_coord(query)) + pgxc_set_shippability_reason(sc_context, SS_NEEDS_COORD); + + /* PGXC_FQS_TODO: It should be possible to look at the Query and find out + * whether it can be completely evaluated on the Datanode just like SELECT + * queries. 
But we need to be careful while finding out the Datanodes to + * execute the query on, esp. for the result relations. If one happens to + * remove/change this restriction, make sure you change + * pgxc_FQS_get_relation_nodes appropriately. + * For now DMLs with single rtable entry are candidates for FQS + */ + if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * In following conditions query is shippable when there is only one + * Datanode involved + * 1. the query has aggregagtes + * 2. the query has window functions + * 3. the query has ORDER BY clause + * 4. the query has Distinct clause + * 5. the query has limit and offset clause + * + * PGXC_FQS_TODO: Condition 1 above is really dependent upon the GROUP BY clause. If + * all rows in each group reside on the same Datanode, aggregates can be + * evaluated on that Datanode, thus condition 1 is has aggregates & the rows + * in any group reside on multiple Datanodes. + * PGXC_FQS_TODO: Condition 2 above is really dependent upon whether the distinct + * clause has distribution column in it. If the distinct clause has + * distribution column in it, we can ship DISTINCT clause to the Datanodes. + */ + if (query->hasAggs || query->hasWindowFuncs || query->sortClause || + query->distinctClause || query->groupClause || query->havingQual || + query->limitOffset || query->limitCount) + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* walk the entire query tree to analyse the query */ + if (query_tree_walker(query, pgxc_shippability_walker, sc_context, 0)) + return true; + + /* + * PGXC_FQS_TODO: + * There is a subquery in this query, which references Vars in the upper + * query. For now stop shipping such queries. We should get rid of this + * condition. 
+ */ + if (sc_context->sc_max_varlevelsup != 0) + pgxc_set_shippability_reason(sc_context, SS_VARLEVEL); + + /* + * Walk the RangeTableEntries of the query and find the + * Datanodes needed for evaluating this query + */ + pgxc_FQS_find_datanodes(sc_context); + } + break; + + case T_FromExpr: + { + /* We don't expect FromExpr in a stand-alone expression */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * We will be examining the range table entries separately and + * Join expressions are not candidate for FQS. + * If this is an INSERT query with quals, resulting from say + * conditional rule, we can not handle those in FQS, since there is + * not SQL representation for such quals. + */ + if (sc_context->sc_query->commandType == CMD_INSERT && + ((FromExpr *)node)->quals) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + } + break; + + case T_WindowFunc: + { + WindowFunc *winf = (WindowFunc *)node; + /* + * A window function can be evaluated on a Datanode if there is + * only one Datanode involved. + */ + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * A window function is not shippable as part of a stand-alone + * expression. If the window function is non-immutable, it can not + * be shipped to the datanodes. + */ + if (sc_context->sc_for_expr || + !is_immutable_func(winf->winfnoid)) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_WindowClause: + { + /* + * A window function can be evaluated on a Datanode if there is + * only one Datanode involved. 
+ */ + pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE); + + /* + * A window function is not shippable as part of a stand-alone + * expression + */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } + break; + + case T_JoinExpr: + /* We don't expect JoinExpr in a stand-alone expression */ + if (sc_context->sc_for_expr) + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + + /* + * For JoinExpr in a Query + * The compatibility of joining ranges will be deduced while + * examining the range table of the query. Nothing to do here + */ + break; + + case T_SubLink: + { + SubLink *sublink = (SubLink *)node; + ExecNodes *sublink_en; + /* + * Walk the query and find the nodes where the query should be + * executed and node distribution. Merge this with the existing + * node list obtained for other subqueries. If merging fails, we + * can not ship the whole query. + */ + if (IsA(sublink->subselect, Query)) + sublink_en = pgxc_is_query_shippable((Query *)(sublink->subselect), + sc_context->sc_query_level); + else + sublink_en = NULL; + + /* PGXCTODO free the old sc_subquery_en. */ + /* If we already know that this query does not have a set of nodes + * to evaluate on, don't bother to merge again. 
+ */ + if (!pgxc_test_shippability_reason(sc_context, SS_NO_NODES)) + { + sc_context->sc_subquery_en = pgxc_merge_exec_nodes(sublink_en, + sc_context->sc_subquery_en, + false, + true); + if (!sc_context->sc_subquery_en) + pgxc_set_shippability_reason(sc_context, SS_NO_NODES); + } + } + break; + + case T_SubPlan: + case T_AlternativeSubPlan: + case T_CommonTableExpr: + case T_SetOperationStmt: + case T_PlaceHolderVar: + case T_AppendRelInfo: + case T_PlaceHolderInfo: + { + /* PGXCTODO: till we exhaust this list */ + pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); + } + break; + + default: + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(node)); + break; + } + return expression_tree_walker(node, pgxc_shippability_walker, (void *)sc_context); +} + +/* + * See if we can reduce the passed in RemoteQuery nodes to a single step. + * + * We need to check when we can further collapse already collapsed nodes. + * We cannot always collapse- we do not want to allow a replicated table + * to be used twice. That is if we have + * + * partitioned_1 -- replicated -- partitioned_2 + * + * partitioned_1 and partitioned_2 cannot (usually) be safely joined only + * locally. + * We can do this by checking (may need tracking) what type it is, + * and looking at context->conditions->replicated_joins + * + * The following cases are possible, and whether or not it is ok + * to reduce. + * + * If the join between the two RemoteQuery nodes is replicated + * + * Node 1 Node 2 + * rep-part folded rep-part folded ok to reduce? + * 0 0 0 1 1 + * 0 0 1 1 1 + * 0 1 0 1 1 + * 0 1 1 1 1 + * 1 1 1 1 0 + * + * + * If the join between the two RemoteQuery nodes is replicated - partitioned + * + * Node 1 Node 2 + * rep-part folded rep-part folded ok to reduce? 
+ * 0 0 0 1 1 + * 0 0 1 1 0 + * 0 1 0 1 1 + * 0 1 1 1 0 + * 1 1 1 1 0 + * + * + * If the join between the two RemoteQuery nodes is partitioned - partitioned + * it is always reducibile safely, + * + * RemoteQuery *innernode - the inner node + * RemoteQuery *outernode - the outer node + * List *rtable_list - rtables + * JoinPath *join_path - used to examine join restrictions + * PGXCJoinInfo *join_info - contains info about the join reduction + * join_info->partitioned_replicated is set to true if we have a partitioned-replicated + * join. We want to use replicated tables with non-replicated + * tables ony once. Only use this value if this function + * returns true. + */ +ExecNodes * +IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, Relids in_relids, Relids out_relids, + Join *join, JoinPath *join_path, List *rtables) +{ + ExecNodes *join_exec_nodes; + bool merge_dist_equijoin = false; + bool merge_replicated_only; + ListCell *cell; + ExecNodes *inner_en = innernode->exec_nodes; + ExecNodes *outer_en = outernode->exec_nodes; + List *quals = join->joinqual; + + /* + * When join type is other than INNER, we will get the unmatched rows on + * either side. The result will be correct only in case both the sides of + * join are replicated. In case one of the sides is replicated, and the + * unmatched results are not coming from that side, it might be possible to + * ship such join, but this needs to be validated from correctness + * perspective. + */ + merge_replicated_only = (join->jointype != JOIN_INNER); + + /* + * If both the relations are distributed with similar distribution strategy + * walk through the restriction info for this JOIN to find if there is an + * equality condition on the distributed columns of both the relations. In + * such case, we can reduce the JOIN if the distribution nodelist is also + * same. 
+ */ + if (IsLocatorDistributedByValue(inner_en->baselocatortype) && + inner_en->baselocatortype == outer_en->baselocatortype && + !merge_replicated_only) + { + foreach(cell, quals) + { + Node *qual = (Node *)lfirst(cell); + if (pgxc_qual_hash_dist_equijoin(in_relids, out_relids, InvalidOid, + qual, rtables)) + { + merge_dist_equijoin = true; + break; + } + } + } + /* + * If the ExecNodes of inner and outer nodes can be merged, the JOIN is + * shippable + * PGXCTODO: Can we take into consideration the JOIN conditions to optimize + * further? + */ + join_exec_nodes = pgxc_merge_exec_nodes(inner_en, outer_en, + merge_dist_equijoin, + merge_replicated_only); + return join_exec_nodes; +} + +/* + * validate whether partition column of a table is being updated + */ +static void +validate_part_col_updatable(const Query *query) +{ + RangeTblEntry *rte; + RelationLocInfo *rel_loc_info; + ListCell *lc; + + /* Make sure there is one table at least */ + if (query->rtable == NULL) + return; + + rte = (RangeTblEntry *) list_nth(query->rtable, query->resultRelation - 1); + + + if (rte != NULL && rte->relkind != RELKIND_RELATION) + /* Bad relation type */ + return; + + /* See if we have the partitioned case. 
*/ + rel_loc_info = GetRelationLocInfo(rte->relid); + + /* Any column updation on local relations is fine */ + if (!rel_loc_info) + return; + + + /* Only LOCATOR_TYPE_HASH & LOCATOR_TYPE_MODULO should be checked */ + if ( (rel_loc_info->partAttrName != NULL) && + ( (rel_loc_info->locatorType == LOCATOR_TYPE_HASH) || (rel_loc_info->locatorType == LOCATOR_TYPE_MODULO) ) ) + { + /* It is a partitioned table, check partition column in targetList */ + foreach(lc, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + if (tle->resjunk) + continue; + + /* + * See if we have a constant expression comparing against the + * designated partitioned column + */ + if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + (errmsg("Partition column can't be updated in current version")))); + } + } +} + +/* + * AddRemoteQueryNode + * + * Add a Remote Query node to launch on Datanodes. + * This can only be done for a query a Top Level to avoid + * duplicated queries on Datanodes. + */ +List * +AddRemoteQueryNode(List *stmts, const char *queryString, RemoteQueryExecType remoteExecType, bool is_temp) +{ + List *result = stmts; + + /* If node is appplied on EXEC_ON_NONE, simply return the list unchanged */ + if (remoteExecType == EXEC_ON_NONE) + return result; + + /* Only a remote Coordinator is allowed to send a query to backend nodes */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + RemoteQuery *step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->sql_statement = (char *) queryString; + step->exec_type = remoteExecType; + step->is_temp = is_temp; + result = lappend(result, step); + } + + return result; +} + +/* + * pgxc_query_contains_temp_tables + * + * Check if there is any temporary object used in given list of queries. 
+ */ +bool +pgxc_query_contains_temp_tables(List *queries) +{ + ListCell *elt; + + foreach(elt, queries) + { + Query *query = (Query *) lfirst(elt); + + if (!query) + continue; + + switch(query->commandType) + { + case CMD_SELECT: + case CMD_UPDATE: + case CMD_INSERT: + case CMD_DELETE: + if (contains_temp_tables(query->rtable)) + return true; + default: + break; + } + } + + return false; +} +#endif + + +#ifdef XCP +/* + * AddRemoteQueryNode + * + * Add a Remote Query node to launch on Datanodes. + * This can only be done for a query a Top Level to avoid + * duplicated queries on Datanodes. + */ +List * +AddRemoteQueryNode(List *stmts, const char *queryString, RemoteQueryExecType remoteExecType) +{ + List *result = stmts; + + /* If node is appplied on EXEC_ON_NONE, simply return the list unchanged */ + if (remoteExecType == EXEC_ON_NONE) + return result; + + /* Only a remote Coordinator is allowed to send a query to backend nodes */ + if (remoteExecType == EXEC_ON_CURRENT || + (IS_PGXC_COORDINATOR && !IsConnFromCoord())) + { + RemoteQuery *step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->sql_statement = (char *) queryString; + step->exec_type = remoteExecType; + result = lappend(result, step); + } + + return result; +} +#endif + + +/* + * pgxc_direct_planner + * The routine tries to see if the statement can be completely evaluated on the + * datanodes. In such cases coordinator is not needed to evaluate the statement, + * and just acts as a proxy. A statement can be completely shipped to the remote + * node if every row of the result can be evaluated on a single datanode. 
+ * For example: + * + * Only EXECUTE DIRECT statements are sent directly as of now + */ +PlannedStmt * +pgxc_direct_planner(Query *query, int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *result; + RemoteQuery *query_step; + + /* build the PlannedStmt result */ + result = makeNode(PlannedStmt); + + /* Try and set what we can */ + result->commandType = query->commandType; + result->canSetTag = query->canSetTag; + result->utilityStmt = query->utilityStmt; + result->rtable = query->rtable; + + /* EXECUTE DIRECT statements have their RemoteQuery node already built when analyzing */ + if (query->utilityStmt + && IsA(query->utilityStmt, RemoteQuery)) + { + RemoteQuery *stmt = (RemoteQuery *) query->utilityStmt; + if (stmt->exec_direct_type != EXEC_DIRECT_NONE) + { + query_step = stmt; + query->utilityStmt = NULL; + result->utilityStmt = NULL; + } + } + + /* Optimize multi-node handling */ + query_step->read_only = query->commandType == CMD_SELECT; + + result->planTree = (Plan *) query_step; + +#ifndef XCP + query->qry_finalise_aggs = false; +#endif + query_step->scan.plan.targetlist = query->targetList; + + return result; +} + +#ifndef XCP +/* + * pgxc_query_contains_utility + * + * Check if there is any utility statement in given list of queries. 
+ */ +bool +pgxc_query_contains_utility(List *queries) +{ + ListCell *elt; + + foreach(elt, queries) + { + Query *query = (Query *) lfirst(elt); + + if (!query) + continue; + + if (query->commandType == CMD_UTILITY) + return true; + } + + return false; +} + + +/* + * pgxc_set_remote_parameters + * + * Set the list of remote parameters for remote plan + */ +static void +pgxc_set_remote_parameters(PlannedStmt *plan, ParamListInfo boundParams) +{ + Oid *param_types; + int cntParam, i; + + /* Leave if no plan */ + if (!plan) + return; + + /* Leave if no parameters */ + if (!boundParams) + return; + + /* + * Count the number of remote parameters available + * We need to take into account all the parameters + * that are prior to the latest available. This insures + * that remote node will not complain about an incorrect + * number of parameter. In case parameters with no types + * are taken into account, they are considered as NULL entries. + */ + cntParam = 0; + for (i = 0; i < boundParams->numParams; i++) + { + if (OidIsValid(boundParams->params[i].ptype)) + cntParam = i + 1; + } + + /* If there are no parameters available, simply leave */ + if (cntParam == 0) + return; + + param_types = (Oid *) palloc(sizeof(Oid) * cntParam); + + /* Then fill the array of types */ + for (i = 0; i < cntParam; i++) + param_types[i] = boundParams->params[i].ptype; + + /* Finally save the parameters in plan */ + SetRemoteStatementName(plan->planTree, NULL, + cntParam, param_types, 0); + + return; +} +#endif diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile index 019c756735..f3830be690 100644 --- a/src/backend/pgxc/pool/Makefile +++ b/src/backend/pgxc/pool/Makefile @@ -14,6 +14,6 @@ subdir = src/backend/pgxc/pool top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o poolutils.o +OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o postgresql_fdw.o poolutils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index f19eb0498f..14c75747c0 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -5,6 +5,11 @@ * Functions to execute commands on remote Datanodes * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -31,6 +36,11 @@ #include "libpq/libpq.h" #include "miscadmin.h" #include "pgxc/execRemote.h" +#ifdef XCP +#include "executor/nodeSubplan.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#endif #include "nodes/nodes.h" #include "nodes/nodeFuncs.h" #include "optimizer/var.h" @@ -52,8 +62,17 @@ /* Enforce the use of two-phase commit when temporary objects are used */ bool EnforceTwoPhaseCommit = true; - +/* + * We do not want it too long, when query is terminating abnormally we just + * want to read in already available data, if datanode connection will reach a + * consistent state after that, we will go normal clean up procedure: send down + * ABORT etc., if data node is not responding we will signal pooler to drop + * the connection. + * It is better to drop and recreate datanode connection then wait for several + * seconds while it being cleaned up when, for example, cancelling query. 
+ */ #define END_QUERY_TIMEOUT 20 +#ifndef XCP #define ROLLBACK_RESP_LEN 9 typedef enum RemoteXactNodeStatus @@ -108,6 +127,7 @@ typedef struct RemoteXactState } RemoteXactState; static RemoteXactState remoteXactState; +#endif #ifdef PGXC typedef struct @@ -124,6 +144,7 @@ typedef struct #define COPY_BUFFER_SIZE 8192 #define PRIMARY_NODE_WRITEAHEAD 1024 * 1024 +#ifndef XCP /* * List of PGXCNodeHandle to track readers and writers involved in the * current transaction @@ -131,6 +152,7 @@ typedef struct static List *XactWriteNodes; static List *XactReadNodes; static char *preparedNodes; +#endif /* * Flag to track if a temporary object is accessed by the current transaction @@ -148,39 +170,169 @@ static PGXCNodeAllHandles * get_exec_connections(RemoteQueryState *planstate, ExecNodes *exec_nodes, RemoteQueryExecType exec_type); +#ifndef XCP static void close_node_cursors(PGXCNodeHandle **connections, int conn_count, char *cursor); static int pgxc_get_transaction_nodes(PGXCNodeHandle *connections[], int size, bool writeOnly); static int pgxc_get_connections(PGXCNodeHandle *connections[], int size, List *connlist); +#endif static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection, RemoteQueryState *remotestate, Snapshot snapshot); +#ifndef XCP static TupleTableSlot * RemoteQueryNext(ScanState *node); static bool RemoteQueryRecheck(RemoteQueryState *node, TupleTableSlot *slot); - static char *generate_begin_command(void); -static bool pgxc_node_remote_prepare(char *prepareGID); +#endif + +#ifdef XCP +static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode); +static bool pgxc_node_remote_finish(char *prepareGID, bool commit, + char *nodestring, GlobalTransactionId gxid, + GlobalTransactionId prepare_gxid); +#else +static bool pgxc_node_remote_prepare(char *prepareGID, bool localNode); +static char *pgxc_node_get_nodelist(bool localNode); +#endif static void pgxc_node_remote_commit(void); static void pgxc_node_remote_abort(void); -static char 
*pgxc_node_get_nodelist(bool localNode); +#ifdef XCP +static void pgxc_connections_cleanup(ResponseCombiner *combiner); +static void pgxc_node_report_error(ResponseCombiner *combiner); +#else static void ExecClearTempObjectIncluded(void); static void init_RemoteXactState(bool preparedLocalNode); static void clear_RemoteXactState(void); static void pgxc_node_report_error(RemoteQueryState *combiner); -static TupleTableSlot *getrow_for_tapesort(RemoteQueryState *combiner, - TupleTableSlot *scanslot); -static bool IsReturningDMLOnReplicatedTable(RemoteQuery *rq); -static void SetDataRowForIntParams(TupleTableSlot *slot, RemoteQueryState *rq_state); +#endif + +#ifdef XCP +#define REMOVE_CURR_CONN(combiner) \ + if ((combiner)->current_conn < --((combiner)->conn_count)) \ + { \ + (combiner)->connections[(combiner)->current_conn] = \ + (combiner)->connections[(combiner)->conn_count]; \ + } \ + else \ + (combiner)->current_conn = 0 +#endif + +#define MAX_STATEMENTS_PER_TRAN 10 + +/* Variables to collect statistics */ +static int total_transactions = 0; +static int total_statements = 0; +static int total_autocommit = 0; +static int nonautocommit_2pc = 0; +static int autocommit_2pc = 0; +static int current_tran_statements = 0; +static int *statements_per_transaction = NULL; +static int *nodes_per_transaction = NULL; + +/* + * statistics collection: count a statement + */ +static void +stat_statement() +{ + total_statements++; + current_tran_statements++; +} + +/* + * To collect statistics: count a transaction + */ +static void +stat_transaction(int node_count) +{ + total_transactions++; + + if (!statements_per_transaction) + { + statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int)); + } + if (current_tran_statements > MAX_STATEMENTS_PER_TRAN) + statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++; + else + 
statements_per_transaction[current_tran_statements]++; + current_tran_statements = 0; + if (node_count > 0 && node_count <= NumDataNodes) + { + if (!nodes_per_transaction) + { + nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int)); + memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int)); + } + nodes_per_transaction[node_count - 1]++; + } +} + + +#ifdef NOT_USED +/* + * To collect statistics: count a two-phase commit on nodes + */ +static void +stat_2pc() +{ + if (autocommit) + autocommit_2pc++; + else + nonautocommit_2pc++; +} +#endif + + +/* + * Output collected statistics to the log + */ +static void +stat_log() +{ + elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements); + elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d", + total_autocommit, autocommit_2pc, nonautocommit_2pc); + if (total_transactions) + { + if (statements_per_transaction) + { + int i; + + for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++) + elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)", + i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions); + } + elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)", + MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions); + if (nodes_per_transaction) + { + int i; + + for (i = 0; i < NumDataNodes; i++) + elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)", + i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions); + } + } +} + /* * Create a structure to store parameters needed to combine responses from * multiple connections as well as state information */ +#ifdef XCP +void +InitResponseCombiner(ResponseCombiner *combiner, int node_count, + CombineType combine_type) +#else static RemoteQueryState * CreateResponseCombiner(int node_count, CombineType combine_type) +#endif { +#ifndef XCP 
RemoteQueryState *combiner; /* ResponseComber is a typedef for pointer to ResponseCombinerData */ @@ -190,32 +342,52 @@ CreateResponseCombiner(int node_count, CombineType combine_type) /* Out of memory */ return combiner; } - +#endif combiner->node_count = node_count; combiner->connections = NULL; combiner->conn_count = 0; combiner->combine_type = combine_type; combiner->command_complete_count = 0; combiner->request_type = REQUEST_TYPE_NOT_DEFINED; - combiner->tuple_desc = NULL; combiner->description_count = 0; combiner->copy_in_count = 0; combiner->copy_out_count = 0; + combiner->copy_file = NULL; combiner->errorMessage = NULL; combiner->errorDetail = NULL; - combiner->query_Done = false; + combiner->tuple_desc = NULL; +#ifdef XCP + combiner->probing_primary = false; + combiner->returning_node = InvalidOid; + combiner->currentRow = NULL; +#else combiner->currentRow.msg = NULL; combiner->currentRow.msglen = 0; combiner->currentRow.msgnode = 0; +#endif combiner->rowBuffer = NIL; combiner->tapenodes = NULL; +#ifdef XCP + combiner->merge_sort = false; + combiner->extended_query = false; + combiner->tapemarks = NULL; + combiner->tuplesortstate = NULL; + combiner->cursor = NULL; + combiner->update_cursor = NULL; + combiner->cursor_count = 0; + combiner->cursor_connections = NULL; combiner->remoteCopyType = REMOTE_COPY_NONE; +#else + combiner->initAggregates = true; + combiner->query_Done = false; combiner->copy_file = NULL; combiner->rqs_cmd_id = FirstCommandId; return combiner; +#endif } + /* * Parse out row count from the command status response and convert it to integer */ @@ -311,8 +483,16 @@ create_tuple_desc(char *msg_body, size_t len) * Handle CopyOutCommandComplete ('c') message from a Datanode connection */ static void +#ifdef XCP +HandleCopyOutComplete(ResponseCombiner *combiner) +#else HandleCopyOutComplete(RemoteQueryState *combiner) +#endif { +#ifdef XCP + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; +#endif if (combiner->request_type == 
REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_COPY_OUT; if (combiner->request_type != REQUEST_TYPE_COPY_OUT) @@ -328,7 +508,11 @@ HandleCopyOutComplete(RemoteQueryState *combiner) * Handle CommandComplete ('C') message from a Datanode connection */ static void +#ifdef XCP +HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn) +#else HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn) +#endif { int digits = 0; EState *estate = combiner->ss.ps.state; @@ -350,11 +534,22 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG { if (combiner->command_complete_count) { +#ifdef XCP /* - * For comments on why non_fqs_dml is required - * see comments in ExecProcNodeDMLInXC + * Replicated command may succeed on on node and fail on + * another. The example is if distributed table referenced + * by a foreign key constraint defined on a partitioned + * table. If command deletes rows from the replicated table + * they may be referenced on one Datanode but not on other. + * So, replicated command on each Datanode either affects + * proper number of rows, or returns error. Here if + * combiner got an error already, we allow to report it, + * not the scaring data corruption message. 
*/ - if (rowcount != estate->es_processed && !combiner->non_fqs_dml) + if (combiner->errorMessage == NULL && rowcount != estate->es_processed) +#else + if (rowcount != estate->es_processed) +#endif /* There is a consistency issue in the database with the replicated table */ ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -362,19 +557,53 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG } else /* first result */ - if (!combiner->non_fqs_dml) - estate->es_processed = rowcount; + estate->es_processed = rowcount; } else - if (!combiner->non_fqs_dml) - estate->es_processed += rowcount; + estate->es_processed += rowcount; } else combiner->combine_type = COMBINE_TYPE_NONE; } /* If response checking is enable only then do further processing */ - +#ifdef XCP + if (conn->ck_resp_rollback) + { + if (strcmp(msg_body, "ROLLBACK") == 0) + { + /* + * Subsequent clean up routine will be checking this flag + * to determine nodes where to send ROLLBACK PREPARED. + * On current node PREPARE has failed and the two-phase record + * does not exist, so clean this flag as if PREPARE was not sent + * to that node and avoid erroneous command. + */ + conn->ck_resp_rollback = false; + /* + * Set the error, if none, to force throwing. + * If there is error already, it will be thrown anyway, do not add + * this potentially confusing message + */ + if (combiner->errorMessage == NULL) + { + combiner->errorMessage = + pstrdup("unexpected ROLLBACK from remote node"); + /* + * ERRMSG_PRODUCER_ERROR + * Messages with this code are replaced by others, if they are + * received, so if node will send relevant error message that + * one will be replaced. 
+ */ + combiner->errorCode[0] = 'X'; + combiner->errorCode[1] = 'X'; + combiner->errorCode[2] = '0'; + combiner->errorCode[3] = '1'; + combiner->errorCode[4] = '0'; + } + } + } +#else if (conn->ck_resp_rollback == RESP_ROLLBACK_CHECK) { conn->ck_resp_rollback = RESP_ROLLBACK_NOT_RECEIVED; @@ -384,6 +613,7 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG conn->ck_resp_rollback = RESP_ROLLBACK_RECEIVED; } } +#endif combiner->command_complete_count++; } @@ -392,8 +622,16 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG * Handle RowDescription ('T') message from a Datanode connection */ static bool +#ifdef XCP +HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len) +#else HandleRowDescription(RemoteQueryState *combiner, char *msg_body, size_t len) +#endif { +#ifdef XCP + if (combiner->request_type == REQUEST_TYPE_ERROR) + return false; +#endif if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_QUERY; if (combiner->request_type != REQUEST_TYPE_QUERY) @@ -441,8 +679,16 @@ HandleParameterStatus(RemoteQueryState *combiner, char *msg_body, size_t len) * Handle CopyInResponse ('G') message from a Datanode connection */ static void +#ifdef XCP +HandleCopyIn(ResponseCombiner *combiner) +#else HandleCopyIn(RemoteQueryState *combiner) +#endif { +#ifdef XCP + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; +#endif if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_COPY_IN; if (combiner->request_type != REQUEST_TYPE_COPY_IN) @@ -463,8 +709,16 @@ HandleCopyIn(RemoteQueryState *combiner) * Handle CopyOutResponse ('H') message from a Datanode connection */ static void +#ifdef XCP +HandleCopyOut(ResponseCombiner *combiner) +#else HandleCopyOut(RemoteQueryState *combiner) +#endif { +#ifdef XCP + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; +#endif if (combiner->request_type == 
REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_COPY_OUT; if (combiner->request_type != REQUEST_TYPE_COPY_OUT) @@ -485,8 +739,16 @@ HandleCopyOut(RemoteQueryState *combiner) * Handle CopyOutDataRow ('d') message from a Datanode connection */ static void +#ifdef XCP +HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len) +#else HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) +#endif { +#ifdef XCP + if (combiner->request_type == REQUEST_TYPE_ERROR) + return; +#endif if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) combiner->request_type = REQUEST_TYPE_COPY_OUT; @@ -511,6 +773,9 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) pq_putmessage('d', msg_body, len); break; case REMOTE_COPY_TUPLESTORE: +#ifdef XCP + tuplestore_putmessage(combiner->tuplestorestate, len, msg_body); +#else { Datum *values; bool *nulls; @@ -586,6 +851,7 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) pfree(in_functions); pfree(typioparams); } +#endif break; case REMOTE_COPY_NONE: default: @@ -595,9 +861,67 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) /* * Handle DataRow ('D') message from a Datanode connection - * The function returns true if buffer can accept more data rows. - * Caller must stop reading if function returns false + * The function returns true if data row is accepted and successfully stored + * within the combiner. 
*/ +#ifdef XCP +static bool +HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node) +{ + /* We expect previous message is consumed */ + Assert(combiner->currentRow == NULL); + + if (combiner->request_type == REQUEST_TYPE_ERROR) + return false; + + if (combiner->request_type != REQUEST_TYPE_QUERY) + { + /* Inconsistent responses */ + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type))); + } + + /* + * If we got an error already ignore incoming data rows from other nodes + * Still we want to continue reading until get CommandComplete + */ + if (combiner->errorMessage) + return false; + + /* + * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples + * from one node, skip others as duplicates + */ + if (combiner->combine_type == COMBINE_TYPE_SAME) + { + /* Do not return rows when probing primary, instead return when doing + * first normal node. Just save some CPU and traffic in case if + * probing fails. 
+ */ + if (combiner->probing_primary) + return false; + if (OidIsValid(combiner->returning_node)) + { + if (combiner->returning_node != node) + return false; + } + else + combiner->returning_node = node; + } + + /* + * We are copying message because it points into connection buffer, and + * will be overwritten on next socket read + */ + combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len); + memcpy(combiner->currentRow->msg, msg_body, len); + combiner->currentRow->msglen = len; + combiner->currentRow->msgnode = node; + + return true; +} +#else static void HandleDataRow(RemoteQueryState *combiner, char *msg_body, size_t len, int nid) { @@ -634,12 +958,17 @@ HandleDataRow(RemoteQueryState *combiner, char *msg_body, size_t len, int nid) combiner->currentRow.msglen = len; combiner->currentRow.msgnode = nid; } +#endif /* * Handle ErrorResponse ('E') message from a Datanode connection */ static void +#ifdef XCP +HandleError(ResponseCombiner *combiner, char *msg_body, size_t len) +#else HandleError(RemoteQueryState *combiner, char *msg_body, size_t len) +#endif { /* parse error message */ char *code = NULL; @@ -692,6 +1021,24 @@ HandleError(RemoteQueryState *combiner, char *msg_body, size_t len) * ReadyForQuery is received, so we just store the error message. * If multiple connections return errors only first one is reported. */ +#ifdef XCP + /* + * The producer error may be hiding primary error, so if previously received + * error is a producer error allow it to be overwritten. 
+ */ + if (combiner->errorMessage == NULL || + MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1], + combiner->errorCode[2], combiner->errorCode[3], + combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR) + { + combiner->errorMessage = pstrdup(message); + /* Error Code is exactly 5 significant bytes */ + if (code) + memcpy(combiner->errorCode, code, 5); + if (detail) + combiner->errorDetail = pstrdup(detail); + } +#else if (!combiner->errorMessage) { combiner->errorMessage = pstrdup(message); @@ -704,6 +1051,7 @@ HandleError(RemoteQueryState *combiner, char *msg_body, size_t len) { combiner->errorDetail = pstrdup(detail); } +#endif /* * If Datanode have sent ErrorResponse it will never send CommandComplete. @@ -791,8 +1139,13 @@ HandleCmdComplete(CmdType commandType, CombineTag *combine, /* * HandleDatanodeCommandId ('M') message from a Datanode connection */ +#ifdef XCP +static void +HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len) +#else static void HandleDatanodeCommandId(RemoteQueryState *combiner, char *msg_body, size_t len) +#endif { uint32 n32; CommandId cid; @@ -814,7 +1167,11 @@ HandleDatanodeCommandId(RemoteQueryState *combiner, char *msg_body, size_t len) * successfully */ static bool +#ifdef XCP +validate_combiner(ResponseCombiner *combiner) +#else validate_combiner(RemoteQueryState *combiner) +#endif { /* There was error message while combining */ if (combiner->errorMessage) @@ -853,6 +1210,24 @@ validate_combiner(RemoteQueryState *combiner) /* * Close combiner and free allocated memory, if it is not needed */ +#ifdef XCP +void +CloseCombiner(ResponseCombiner *combiner) +{ + if (combiner->connections) + pfree(combiner->connections); + if (combiner->tuple_desc) + FreeTupleDesc(combiner->tuple_desc); + if (combiner->errorMessage) + pfree(combiner->errorMessage); + if (combiner->cursor_connections) + pfree(combiner->cursor_connections); + if (combiner->tapenodes) + pfree(combiner->tapenodes); + if 
(combiner->tapemarks) + pfree(combiner->tapemarks); +} +#else static void CloseCombiner(RemoteQueryState *combiner) { @@ -881,12 +1256,17 @@ CloseCombiner(RemoteQueryState *combiner) pfree(combiner); } } +#endif /* * Validate combiner and release storage freeing allocated memory */ static bool +#ifdef XCP +ValidateAndCloseCombiner(ResponseCombiner *combiner) +#else ValidateAndCloseCombiner(RemoteQueryState *combiner) +#endif { bool valid = validate_combiner(combiner); @@ -911,6 +1291,171 @@ ValidateAndCloseCombiner(RemoteQueryState *combiner) * points to the original RemoteQueryState. If combiner differs from "this" the * connection should be buffered. */ +#ifdef XCP +void +BufferConnection(PGXCNodeHandle *conn) +{ + ResponseCombiner *combiner = conn->combiner; + MemoryContext oldcontext; + + if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY) + return; + + elog(LOG, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor); + + /* + * When BufferConnection is invoked CurrentContext is related to other + * portal, which is trying to control the connection. + * TODO See if we can find better context to switch to + */ + oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt); + + /* Verify the connection is in use by the combiner */ + combiner->current_conn = 0; + while (combiner->current_conn < combiner->conn_count) + { + if (combiner->connections[combiner->current_conn] == conn) + break; + combiner->current_conn++; + } + Assert(combiner->current_conn < combiner->conn_count); + + if (combiner->tapemarks == NULL) + combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*)); + + /* + * If current bookmark for the current tape is not set it means either + * first row in the buffer is from the current tape or no rows from + * the tape in the buffer, so if first row is not from current + * connection bookmark the last cell in the list. 
+ */ + if (combiner->tapemarks[combiner->current_conn] == NULL && + list_length(combiner->rowBuffer) > 0) + { + RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer); + if (dataRow->msgnode != conn->nodeoid) + combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer); + } + + /* + * Buffer data rows until data node return number of rows specified by the + * fetch_size parameter of last Execute message (PortalSuspended message) + * or end of result set is reached (CommandComplete message) + */ + while (true) + { + int res; + + /* Move to buffer currentRow (received from the data node) */ + if (combiner->currentRow) + { + combiner->rowBuffer = lappend(combiner->rowBuffer, + combiner->currentRow); + combiner->currentRow = NULL; + } + + res = handle_response(conn, combiner); + /* + * If response message is a DataRow it will be handled on the next + * iteration. + * PortalSuspended will cause connection state change and break the loop + * The same is for CommandComplete, but we need additional handling - + * remove connection from the list of active connections. + * We may need to add handling error response + */ + + /* Most often result check first */ + if (res == RESPONSE_DATAROW) + { + /* + * The row is in the combiner->currentRow, on next iteration it will + * be moved to the buffer + */ + continue; + } + + /* incomplete message, read more */ + if (res == RESPONSE_EOF) + { + if (pgxc_node_receive(1, &conn, NULL)) + { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(conn, "Failed to fetch from data node"); + } + } + + /* + * End of result set is reached, so either set the pointer to the + * connection to NULL (combiner with sort) or remove it from the list + * (combiner without sort) + */ + else if (res == RESPONSE_COMPLETE) + { + /* + * If combiner is doing merge sort we should set reference to the + * current connection to NULL in the array, indicating the end + * of the tape is reached. 
FetchTuple will try to access the buffer + * first anyway. + * Since we remove that reference we can not determine what node + * number was this connection, but we need this info to find proper + * tuple in the buffer if we are doing merge sort. So store node + * number in special array. + * NB: We can not test if combiner->tuplesortstate is set here: + * connection may require buffering inside tuplesort_begin_merge + * - while pre-read rows from the tapes, one of the tapes may be + * the local connection with RemoteSubplan in the tree. The + * combiner->tuplesortstate is set only after tuplesort_begin_merge + * returns. + */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + if (combiner->tapenodes == NULL) + combiner->tapenodes = (Oid *) + palloc0(combiner->conn_count * sizeof(Oid)); + combiner->tapenodes[combiner->current_conn] = conn->nodeoid; + } + else + { + /* Remove current connection, move last in-place, adjust current_conn */ + if (combiner->current_conn < --combiner->conn_count) + combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count]; + else + combiner->current_conn = 0; + } + /* + * If combiner runs Simple Query Protocol we need to read in + * ReadyForQuery. In case of Extended Query Protocol it is not + * sent and we should quit. 
+ */ + if (combiner->extended_query) + break; + } + else if (res == RESPONSE_ERROR) + { + if (combiner->extended_query) + { + /* + * Need to sync connection to enable receiving commands + * by the datanode + */ + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + } + } + else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY) + { + /* Now it is OK to quit */ + break; + } + } + Assert(conn->state != DN_CONNECTION_STATE_QUERY); + MemoryContextSwitchTo(oldcontext); + conn->combiner = NULL; +} +#else void BufferConnection(PGXCNodeHandle *conn) { @@ -982,7 +1527,7 @@ BufferConnection(PGXCNodeHandle *conn) * connection to NULL (step with sort) or remove it from the list * (step without sort) */ - if (combiner->rqs_for_sort) + if (combiner->tuplesortstate) { combiner->connections[combiner->current_conn] = NULL; if (combiner->tapenodes == NULL) @@ -1008,11 +1553,29 @@ BufferConnection(PGXCNodeHandle *conn) MemoryContextSwitchTo(oldcontext); conn->combiner = NULL; } +#endif /* * copy the datarow from combiner to the given slot, in the slot's memory * context */ +#ifdef XCP +static void +CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot) +{ + RemoteDataRow datarow; + MemoryContext oldcontext; + oldcontext = MemoryContextSwitchTo(slot->tts_mcxt); + datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen); + datarow->msgnode = combiner->currentRow->msgnode; + datarow->msglen = combiner->currentRow->msglen; + memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen); + ExecStoreDataRowTuple(datarow, slot, true); + pfree(combiner->currentRow); + combiner->currentRow = NULL; + MemoryContextSwitchTo(oldcontext); +} +#else static void CopyDataRowTupleToSlot(RemoteQueryState *combiner, TupleTableSlot *slot) { @@ -1028,7 +1591,349 @@ CopyDataRowTupleToSlot(RemoteQueryState *combiner, TupleTableSlot *slot) 
combiner->currentRow.msgnode = 0; MemoryContextSwitchTo(oldcontext); } +#endif + + +#ifdef XCP +/* + * FetchTuple + * + Get next tuple from one of the datanode connections. + * The connections should be in combiner->connections, if "local" dummy + * connection presents it should be the last active connection in the array. + * If combiner is set up to perform merge sort function returns tuple from + * connection defined by combiner->current_conn, or NULL slot if no more tuple + * are available from the connection. Otherwise it returns tuple from any + * connection or NULL slot if no more available connections. + * Function looks into combiner->rowBuffer before accessing connection + * and return a tuple from there if found. + * Function may wait while more data arrive from the data nodes. If there + * is a locally executed subplan function advance it and buffer resulting rows + * instead of waiting. + */ +TupleTableSlot * +FetchTuple(ResponseCombiner *combiner) +{ + PGXCNodeHandle *conn; + TupleTableSlot *slot; + Oid nodeOid = -1; + + /* + * Case if we run local subplan. 
+ * We do not have remote connections, so just get local tuple and return it + */ + if (outerPlanState(combiner)) + { + RemoteSubplanState *planstate = (RemoteSubplanState *) combiner; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + /* Advance subplan in a loop until we have something to return */ + for (;;) + { + Datum value = NULL; + bool isnull; + int numnodes; + int i; + + slot = ExecProcNode(outerPlanState(combiner)); + /* If locator is not defined deliver all the results */ + if (planstate->locator == NULL) + return slot; + + /* + * If NULL tuple is returned we done with the subplan, finish it up and + * return NULL + */ + if (TupIsNull(slot)) + return NULL; + + /* Get partitioning value if defined */ + if (plan->distributionKey != InvalidAttrNumber) + value = slot_getattr(slot, plan->distributionKey, &isnull); + + /* Determine target nodes */ + numnodes = GET_NODES(planstate->locator, value, isnull, NULL); + for (i = 0; i < numnodes; i++) + { + /* Deliver the node */ + if (planstate->dest_nodes[i] == PGXCNodeId-1) + return slot; + } + } + } + + /* + * Get current connection + */ + if (combiner->conn_count > combiner->current_conn) + conn = combiner->connections[combiner->current_conn]; + else + conn = NULL; + + /* + * If doing merge sort determine the node number. + * It may be needed to get buffered row. + */ + if (combiner->merge_sort) + { + Assert(conn || combiner->tapenodes); + nodeOid = conn ? conn->nodeoid : + combiner->tapenodes[combiner->current_conn]; + Assert(OidIsValid(nodeOid)); + } + /* + * First look into the row buffer. + * When we are performing merge sort we need to get from the buffer record + * from the connection marked as "current". Otherwise get first. 
+ */ + if (list_length(combiner->rowBuffer) > 0) + { + RemoteDataRow dataRow; + + Assert(combiner->currentRow == NULL); + + if (combiner->merge_sort) + { + ListCell *lc; + ListCell *prev; + + elog(LOG, "Getting buffered tuple from node %x", nodeOid); + + prev = combiner->tapemarks[combiner->current_conn]; + if (prev) + { + /* + * Start looking through the list from the bookmark. + * Probably the first cell we check contains row from the needed + * node. Otherwise continue scanning until we encounter one, + * advancing prev pointer as well. + */ + while((lc = lnext(prev)) != NULL) + { + dataRow = (RemoteDataRow) lfirst(lc); + if (dataRow->msgnode == nodeOid) + { + combiner->currentRow = dataRow; + break; + } + prev = lc; + } + } + else + { + /* + * Either needed row is the first in the buffer or no such row + */ + lc = list_head(combiner->rowBuffer); + dataRow = (RemoteDataRow) lfirst(lc); + if (dataRow->msgnode == nodeOid) + combiner->currentRow = dataRow; + else + lc = NULL; + } + if (lc) + { + /* + * Delete cell from the buffer. Before we delete we must check + * the bookmarks, if the cell is a bookmark for any tape. + * If it is the case we are deleting last row of the current + * block from the current tape. That tape should have bookmark + * like current, and current bookmark will be advanced when we + * read the tape once again. + */ + int i; + for (i = 0; i < combiner->conn_count; i++) + { + if (combiner->tapemarks[i] == lc) + combiner->tapemarks[i] = prev; + } + elog(LOG, "Found buffered tuple from node %x", nodeOid); + combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, + lc, prev); + } + elog(LOG, "Update tapemark"); + combiner->tapemarks[combiner->current_conn] = prev; + } + else + { + dataRow = (RemoteDataRow) linitial(combiner->rowBuffer); + combiner->currentRow = dataRow; + combiner->rowBuffer = list_delete_first(combiner->rowBuffer); + } + } + + /* If we have node message in the currentRow slot, and it is from a proper + * node, consume it. 
*/ + if (combiner->currentRow) + { + Assert(!combiner->merge_sort || + combiner->currentRow->msgnode == nodeOid); + slot = combiner->ss.ps.ps_ResultTupleSlot; + CopyDataRowTupleToSlot(combiner, slot); + return slot; + } + + while (conn) + { + int res; + + /* Going to use a connection, buffer it if needed */ + CHECK_OWNERSHIP(conn, combiner); + + /* + * If current connection is idle it means portal on the data node is + * suspended. Request more and try to get it + */ + if (combiner->extended_query && + conn->state == DN_CONNECTION_STATE_IDLE) + { + /* + * We do not allow to suspend if querying primary node, so that + * only may mean the current node is secondary and subplan was not + * executed there yet. Return and go on with second phase. + */ + if (combiner->probing_primary) + return NULL; + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + if (pgxc_node_send_flush(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + } + + /* read messages */ + res = handle_response(conn, combiner); + if (res == RESPONSE_DATAROW) + { + slot = combiner->ss.ps.ps_ResultTupleSlot; + CopyDataRowTupleToSlot(combiner, slot); + return slot; + } + else if (res == RESPONSE_EOF) + { + /* incomplete message, read more */ + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + continue; + } + else if (res == RESPONSE_SUSPENDED) + { + /* + * If we are doing merge sort or probing primary node we should + * remain on the same node, so query next portion immediately. + * Otherwise leave node suspended and fetch lazily. 
+ */ + if (combiner->merge_sort || combiner->probing_primary) + { + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + if (pgxc_node_send_flush(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + continue; + } + if (++combiner->current_conn >= combiner->conn_count) + combiner->current_conn = 0; + conn = combiner->connections[combiner->current_conn]; + } + else if (res == RESPONSE_COMPLETE) + { + /* + * In case of Simple Query Protocol we should receive ReadyForQuery + * before removing connection from the list. In case of Extended + * Query Protocol we may remove connection right away. + */ + if (combiner->extended_query) + { + /* If we are doing merge sort clean current connection and return + * NULL, otherwise remove current connection, move last in-place, + * adjust current_conn and continue if it is not last connection */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + return NULL; + } + REMOVE_CURR_CONN(combiner); + if (combiner->conn_count > 0) + conn = combiner->connections[combiner->current_conn]; + else + return NULL; + } + } + else if (res == RESPONSE_ERROR) + { + /* + * If doing Extended Query Protocol we need to sync connection, + * otherwise subsequent commands will be ignored. + */ + if (combiner->extended_query) + { + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node"))); + } + /* + * Do not wait for response from primary, it needs to wait + * for other nodes to respond. Instead go ahead and send query to + * other nodes. It will fail there, but we can continue with + * normal cleanup. 
+ */ + if (combiner->probing_primary) + { + REMOVE_CURR_CONN(combiner); + return NULL; + } + } + else if (res == RESPONSE_READY) + { + /* If we are doing merge sort clean current connection and return + * NULL, otherwise remove current connection, move last in-place, + * adjust current_conn and continue if it is not last connection */ + if (combiner->merge_sort) + { + combiner->connections[combiner->current_conn] = NULL; + return NULL; + } + REMOVE_CURR_CONN(combiner); + if (combiner->conn_count > 0) + conn = combiner->connections[combiner->current_conn]; + else + return NULL; + } + else if (res == RESPONSE_TUPDESC) + { + ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot, + combiner->tuple_desc); + /* Now slot is responsible for freeng the descriptor */ + combiner->tuple_desc = NULL; + } + else + { + // Can not get here? + Assert(false); + } + } + + return NULL; +} +#else /* * Get next data row from the combiner's buffer into provided slot * Just clear slot and return false if buffer is empty, that means end of result @@ -1039,12 +1944,6 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) { bool have_tuple = false; - /* - * We don't expect the RemoteQuery feeding a sort to come this way. As of - * now, such a RemoteQuery gets the rows as dictated by the Sort plan above, - * hence fetches the rows on its own. - */ - Assert(!combiner->rqs_for_sort); /* If we have message in the buffer, consume it */ if (combiner->currentRow.msg) { @@ -1053,6 +1952,15 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) } /* + * If this is ordered fetch we can not know what is the node + * to handle next, so sorter will choose next itself and set it as + * currentRow to have it consumed on the next call to FetchTuple. + * Otherwise allow to prefetch next tuple + */ + if (((RemoteQuery *)combiner->ss.ps.plan)->sort) + return have_tuple; + + /* * Note: If we are fetching not sorted results we can not have both * currentRow and buffered rows. 
When connection is buffered currentRow * is moved to buffer, and then it is cleaned after buffering is @@ -1155,6 +2063,15 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) CopyDataRowTupleToSlot(combiner, slot); have_tuple = true; } + + /* + * If this is ordered fetch we can not know what is the node + * to handle next, so sorter will choose next itself and set it as + * currentRow to have it consumed on the next call to FetchTuple. + * Otherwise allow to prefetch next tuple + */ + if (((RemoteQuery *)combiner->ss.ps.plan)->sort) + return have_tuple; } /* report end of data to the caller */ @@ -1163,14 +2080,20 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot) return have_tuple; } +#endif /* * Handle responses from the Datanode connections */ static int +#ifdef XCP +pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, + struct timeval * timeout, ResponseCombiner *combiner) +#else pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, struct timeval * timeout, RemoteQueryState *combiner) +#endif { int count = conn_count; PGXCNodeHandle *to_receive[conn_count]; @@ -1199,6 +2122,14 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, i++; break; case RESPONSE_COMPLETE: +#ifdef XCP + if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL) + /* Continue read until ReadyForQuery */ + break; + /* fallthru */ + case RESPONSE_READY: + /* fallthru */ +#endif case RESPONSE_COPY: /* Handling is done, do not track this connection */ count--; @@ -1206,6 +2137,11 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, if (i < count) to_receive[i] = to_receive[count]; break; +#ifdef XCP + case RESPONSE_ERROR: + /* no handling needed, just wait for ReadyForQuery */ + break; +#endif default: /* Inconsistent responses */ add_error_message(to_receive[i], "Unexpected response from the Datanodes"); @@ -1217,11 +2153,169 @@ 
pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, } } } +#ifndef XCP pgxc_node_report_error(combiner); +#endif return 0; } +#ifdef XCP +/* + * Read next message from the connection and update the combiner + * and connection state accordingly + * If we are in an error state we just consume the messages, and do not proxy + * Long term, we should look into cancelling executing statements + * and closing the connections. + * It returns if states need to be handled + * Return values: + * RESPONSE_EOF - need to receive more data for the connection + * RESPONSE_READY - got ReadyForQuery + * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. + * Also this result is output in case of error + * RESPONSE_SUSPENDED - got PortalSuspended + * RESPONSE_TUPLEDESC - got tuple description + * RESPONSE_DATAROW - got data row + * RESPONSE_COPY - got copy response + * RESPONSE_BARRIER_OK - barrier command completed successfully + */ +int +handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) +{ + char *msg; + int msg_len; + char msg_type; + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* + * Don't read from from the connection if there is a fatal error. + * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since + * Handling of RESPONSE_ERROR assumes sending SYNC message, but + * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is + * not usable. 
+ */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + return RESPONSE_COMPLETE; + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return RESPONSE_EOF; + + Assert(conn->combiner == combiner || conn->combiner == NULL); + + /* TODO handle other possible responses */ + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) + { + case '\0': /* Not enough data in the buffer */ + return RESPONSE_EOF; + case 'c': /* CopyToCommandComplete */ + HandleCopyOutComplete(combiner); + break; + case 'C': /* CommandComplete */ + HandleCommandComplete(combiner, msg, msg_len, conn); + conn->combiner = NULL; + if (conn->state == DN_CONNECTION_STATE_QUERY) + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_COMPLETE; + case 'T': /* RowDescription */ +#ifdef DN_CONNECTION_DEBUG + Assert(!conn->have_row_desc); + conn->have_row_desc = true; +#endif + if (HandleRowDescription(combiner, msg, msg_len)) + return RESPONSE_TUPDESC; + break; + case 'D': /* DataRow */ +#ifdef DN_CONNECTION_DEBUG + Assert(conn->have_row_desc); +#endif + /* Do not return if data row has not been actually handled */ + if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid)) + return RESPONSE_DATAROW; + break; + case 's': /* PortalSuspended */ + /* No activity is expected on the connection until next query */ + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; + return RESPONSE_SUSPENDED; + case '1': /* ParseComplete */ + case '2': /* BindComplete */ + case '3': /* CloseComplete */ + case 'n': /* NoData */ + /* simple notifications, continue reading */ + break; + case 'G': /* CopyInResponse */ + conn->state = DN_CONNECTION_STATE_COPY_IN; + HandleCopyIn(combiner); + /* Done, return to caller to let it know the data can be passed in */ + return RESPONSE_COPY; + case 'H': /* CopyOutResponse */ + conn->state = DN_CONNECTION_STATE_COPY_OUT; + HandleCopyOut(combiner); + return RESPONSE_COPY; + case 'd': /* CopyOutDataRow */ + conn->state = 
DN_CONNECTION_STATE_COPY_OUT; + HandleCopyDataRow(combiner, msg, msg_len); + break; + case 'E': /* ErrorResponse */ + HandleError(combiner, msg, msg_len); + add_error_message(conn, combiner->errorMessage); + return RESPONSE_ERROR; + case 'A': /* NotificationResponse */ + case 'N': /* NoticeResponse */ + case 'S': /* SetCommandComplete */ + /* + * Ignore these to prevent multiple messages, one from each + * node. Coordinator will send one for DDL anyway + */ + break; + case 'Z': /* ReadyForQuery */ + { + /* + * Return result depends on previous connection state. + * If it was PORTAL_SUSPENDED Coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; +#ifdef DN_CONNECTION_DEBUG + conn->have_row_desc = false; +#endif + return RESPONSE_READY; + } + case 'M': /* Command Id */ + HandleDatanodeCommandId(combiner, msg, msg_len); + break; + case 'b': + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + case 'I': /* EmptyQuery */ + return RESPONSE_COMPLETE; + default: + /* sync lost? 
*/ + elog(WARNING, "Received unsupported message type: %c", msg_type); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + /* stop reading */ + return RESPONSE_COMPLETE; + } + } + /* never happen, but keep compiler quiet */ + return RESPONSE_EOF; +} +#else /* * Read next message from the connection and update the combiner accordingly * If we are in an error state we just consume the messages, and do not proxy @@ -1241,7 +2335,6 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) char *msg; int msg_len; char msg_type; - bool suspended = false; for (;;) { @@ -1276,7 +2369,7 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) HandleCopyOutComplete(combiner); break; case 'C': /* CommandComplete */ - HandleCommandComplete(combiner, msg, msg_len, conn); + HandleCommandComplete(combiner, msg, msg_len); break; case 'T': /* RowDescription */ #ifdef DN_CONNECTION_DEBUG @@ -1335,7 +2428,7 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) { /* * Return result depends on previous connection state. - * If it was PORTAL_SUSPENDED Coordinator want to send down + * If it was PORTAL_SUSPENDED coordinator want to send down * another EXECUTE to fetch more rows, otherwise it is done * with the connection */ @@ -1352,8 +2445,10 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) HandleDatanodeCommandId(combiner, msg, msg_len); break; case 'b': - conn->state = DN_CONNECTION_STATE_IDLE; - return RESPONSE_BARRIER_OK; + { + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + } case 'I': /* EmptyQuery */ default: /* sync lost? 
*/ @@ -1366,10 +2461,11 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) /* never happen, but keep compiler quiet */ return RESPONSE_EOF; } +#endif /* - * Has the Datanode sent Ready For Query + * Has the data node sent Ready For Query */ bool @@ -1378,6 +2474,7 @@ is_data_node_ready(PGXCNodeHandle * conn) char *msg; int msg_len; char msg_type; + bool suspended = false; for (;;) { @@ -1402,6 +2499,7 @@ is_data_node_ready(PGXCNodeHandle * conn) switch (msg_type) { case 's': /* PortalSuspended */ + suspended = true; break; case 'Z': /* ReadyForQuery */ @@ -1421,6 +2519,8 @@ is_data_node_ready(PGXCNodeHandle * conn) return false; } + +#ifndef XCP /* * Construct a BEGIN TRANSACTION command after taking into account the * current options. The returned string is not palloced and is valid only until @@ -1452,6 +2552,7 @@ generate_begin_command(void) return begin_cmd; } +#endif /* * Send BEGIN command to the Datanodes or Coordinators and receive responses. @@ -1464,12 +2565,20 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, { int i; struct timeval *timeout = NULL; +#ifdef XCP + ResponseCombiner combiner; +#else RemoteQueryState *combiner; +#endif TimestampTz timestamp = GetCurrentGTMStartTimestamp(); PGXCNodeHandle *new_connections[conn_count]; int new_count = 0; +#ifdef XCP + char *init_str; +#else int con[conn_count]; int j = 0; +#endif /* * If no remote connections, we don't have anything to do @@ -1479,6 +2588,12 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, for (i = 0; i < conn_count; i++) { +#ifdef XCP + if (!readOnly && !IsConnFromDatanode()) + { + connections[i]->read_only = false; + } +#else /* * If the node is already a participant in the transaction, skip it */ @@ -1493,7 +2608,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, RegisterTransactionNodes(1, (void **)&connections[i], true); continue; } - +#endif /* * PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY * state 
when we are about to send a BEGIN TRANSACTION command to the @@ -1511,13 +2626,26 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) return EOF; +#ifdef XCP + if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid)) + need_tran_block = true; + /* Send BEGIN if not already in transaction */ + if (need_tran_block && connections[i]->transaction_status == 'I') +#else /* Send BEGIN */ if (need_tran_block) +#endif { /* Send the BEGIN TRANSACTION command and check for errors */ +#ifdef XCP + if (pgxc_node_send_query(connections[i], "BEGIN")) + return EOF; +#else if (pgxc_node_send_query(connections[i], generate_begin_command())) return EOF; +#endif +#ifndef XCP con[j++] = PGXCNodeGetNodeId(connections[i]->nodeoid, node_type); /* * Register the node as a participant in the transaction. The @@ -1531,6 +2659,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * read-write statement. 
*/ RegisterTransactionNodes(1, (void **)&connections[i], !readOnly); +#endif new_connections[new_count++] = connections[i]; } } @@ -1542,6 +2671,31 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if (new_count == 0) return 0; +#ifdef XCP + InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + + /* Receive responses */ + if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner)) + return EOF; + + /* Verify status */ + if (!ValidateAndCloseCombiner(&combiner)) + return EOF; + + /* after transactions are started send down local set commands */ + init_str = PGXCNodeGetTransactionParamStr(); + if (init_str) + { + for (i = 0; i < new_count; i++) + { + pgxc_node_set_query(new_connections[i], init_str); + } + } +#else combiner = CreateResponseCombiner(new_count, COMBINE_TYPE_NONE); /* Receive responses */ @@ -1568,11 +2722,701 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if (res != 0) return EOF; } +#endif /* No problem, let's get going */ return 0; } + +#ifdef XCP +/* + * Execute DISCARD ALL command on all allocated nodes to remove all session + * specific stuff before releasing them to pool for reuse by other sessions. + */ +static void +pgxc_node_remote_cleanup_all(void) +{ + PGXCNodeAllHandles *handles = get_current_handles(); + PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; + int new_conn_count = 0; + int i; + char *resetcmd = "RESET ALL;RESET SESSION AUTHORIZATION;" + "RESET transaction_isolation;"; + + /* + * We must handle reader and writer connections both since even a read-only + * needs to be cleaned up. + */ + if (handles->co_conn_count + handles->dn_conn_count == 0) + return; + + /* + * Send down snapshot followed by DISCARD ALL command. 
+ */ + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *handle = handles->coord_handles[i]; + + /* At this point connection should be in IDLE state */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + + /* + * We must go ahead and release connections anyway, so do not throw + * an error if we have a problem here. + */ + if (pgxc_node_send_query(handle, resetcmd)) + { + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to clean up data nodes"))); + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + new_connections[new_conn_count++] = handle; + } + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *handle = handles->datanode_handles[i]; + + /* At this point connection should be in IDLE state */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + + /* + * We must go ahead and release connections anyway, so do not throw + * an error if we have a problem here. + */ + if (pgxc_node_send_query(handle, resetcmd)) + { + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to clean up data nodes"))); + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + continue; + } + new_connections[new_conn_count++] = handle; + } + + if (new_conn_count) + { + ResponseCombiner combiner; + InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner); + CloseCombiner(&combiner); + } +} + + +/* + * Prepare nodes which ran write operations during the transaction. + * Read only remote transactions are committed and connections are released + * back to the pool. + * Function returns the list of nodes where transaction is prepared, including + * local node, if requested, in format expected by the GTM server. 
+ * If something went wrong the function tries to abort prepared transactions on + * the nodes where it succeeded and throws error. A warning is emitted if abort + * prepared fails. + * After completion remote connection handles are released. + */ +static char * +pgxc_node_remote_prepare(char *prepareGID, bool localNode) +{ + bool isOK = true; + StringInfoData nodestr; + char prepare_cmd[256]; + char abort_cmd[256]; + GlobalTransactionId auxXid; + char *commit_cmd = "COMMIT TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + initStringInfo(&nodestr); + if (localNode) + appendStringInfoString(&nodestr, PGXCNodeName); + + sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* + * If something went wrong already we have nothing to do here. The error + * will be reported at the end of the function, and we will rollback + * remotes as part of the error handling. + * Just skip to clean up section and check if we have already prepared + * somewhere, we should abort that prepared transaction. + */ + if (!isOK) + goto prepare_err; + + /* + * Skip empty slots + */ + if (conn->sock == NO_SOCKET) + continue; + else if (conn->transaction_status == 'T') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + if (conn->read_only) + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, commit_cmd)) + { + /* + * not a big deal, it was read only, the connection will be + * abandoned later. 
+ */ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + else + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, prepare_cmd)) + { + /* + * That is the trouble, we really want to prepare it. + * Just emit warning so far and go to clean up. + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send PREPARE TRANSACTION command to " + "the node %u", conn->nodeoid))); + } + else + { + char *nodename = get_pgxc_nodename(conn->nodeoid); + if (nodestr.len > 0) + appendStringInfoChar(&nodestr, ','); + appendStringInfoString(&nodestr, nodename); + /* Read responses from these */ + connections[conn_count++] = conn; + /* + * If it fails on remote node it would just return ROLLBACK. + * Set the flag for the message handler so the response is + * verified. + */ + conn->ck_resp_rollback = true; + } + } + } + else if (conn->transaction_status == 'E') + { + /* + * Probably can not happen, if there was a error the engine would + * abort anyway, even in case of explicit PREPARE. + * Anyway, just in case... + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("remote node %u is in error state", conn->nodeoid))); + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* + * If something went wrong already we have nothing to do here. The error + * will be reported at the end of the function, and we will rollback + * remotes as part of the error handling. + * Just skip to clean up section and check if we have already prepared + * somewhere, we should abort that prepared transaction. 
+ */ + if (!isOK) + goto prepare_err; + + /* + * Skip empty slots + */ + if (conn->sock == NO_SOCKET) + continue; + else if (conn->transaction_status == 'T') + { + if (conn->read_only) + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, commit_cmd)) + { + /* + * not a big deal, it was read only, the connection will be + * abandoned later. + */ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + else + { + /* Send down prepare command */ + if (pgxc_node_send_query(conn, prepare_cmd)) + { + /* + * That is the trouble, we really want to prepare it. + * Just emit warning so far and go to clean up. + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send PREPARE TRANSACTION command to " + "the node %u", conn->nodeoid))); + } + else + { + char *nodename = get_pgxc_nodename(conn->nodeoid); + if (nodestr.len > 0) + appendStringInfoChar(&nodestr, ','); + appendStringInfoString(&nodestr, nodename); + /* Read responses from these */ + connections[conn_count++] = conn; + /* + * If it fails on remote node it would just return ROLLBACK. + * Set the flag for the message handler so the response is + * verified. + */ + conn->ck_resp_rollback = true; + } + } + } + else if (conn->transaction_status == 'E') + { + /* + * Probably can not happen, if there was a error the engine would + * abort anyway, even in case of explicit PREPARE. + * Anyway, just in case... + */ + isOK = false; + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("remote node %u is in error state", conn->nodeoid))); + } + } + + SetSendCommandId(false); + + if (!isOK) + goto prepare_err; + /* exit if nothing has been prepared */ + if (conn_count > 0) + { + int result; + /* + * Receive and check for any errors. In case of errors, we don't bail out + * just yet. 
We first go through the list of connections and look for + * errors on each connection. This is important to ensure that we run + * an appropriate ROLLBACK command later on (prepared transactions must be + * rolled back with ROLLBACK PREPARED commands). + * + * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on + * individual connections. The transaction_status field doesn't get set + * every time there is an error on the connection. The combiner mechanism is + * good for parallel proessing, but I think we should have a leak-proof + * mechanism to track connection status + */ + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + goto prepare_err; + else + CloseCombiner(&combiner); + + /* Before exit clean the flag, to avoid unnecessary checks */ + for (i = 0; i < conn_count; i++) + connections[i]->ck_resp_rollback = false; + + pfree_pgxc_all_handles(handles); + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + } + + return nodestr.data; +prepare_err: + sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID); + + auxXid = GetAuxilliaryTransactionId(); + conn_count = 0; + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* + * PREPARE succeeded on that node, roll it back there + */ + if (conn->ck_resp_rollback) + { + conn->ck_resp_rollback = false; + /* sanity checks */ + Assert(conn->sock != NO_SOCKET); + Assert(conn->transaction_status == 'I'); + Assert(conn->state == DN_CONNECTION_STATE_IDLE); + /* Send down abort prepared command */ + if (pgxc_node_send_gxid(conn, auxXid)) + { + /* + * Prepared transaction is left on the node, but we can not + * do anything with that except warn the user. 
+             */
+            ereport(WARNING,
+                    (errcode(ERRCODE_INTERNAL_ERROR),
+                     errmsg("failed to send xid to "
+                            "the node %u", conn->nodeoid)));
+        }
+        if (pgxc_node_send_query(conn, abort_cmd))
+        {
+            /*
+             * Prepared transaction is left on the node, but we can not
+             * do anything with that except warn the user.
+             */
+            ereport(WARNING,
+                    (errcode(ERRCODE_INTERNAL_ERROR),
+                     errmsg("failed to send ABORT PREPARED command to "
+                            "the node %u", conn->nodeoid)));
+        }
+        else
+        {
+            /* Read responses from these */
+            connections[conn_count++] = conn;
+        }
+        }
+    }
+    for (i = 0; i < handles->co_conn_count; i++)
+    {
+        PGXCNodeHandle *conn = handles->coord_handles[i];
+
+        if (conn->ck_resp_rollback)
+        {
+            conn->ck_resp_rollback = false;
+            /* sanity checks */
+            Assert(conn->sock != NO_SOCKET);
+            Assert(conn->transaction_status == 'I');
+            Assert(conn->state == DN_CONNECTION_STATE_IDLE);
+            /* Send down abort prepared command */
+            if (pgxc_node_send_gxid(conn, auxXid))
+            {
+                /*
+                 * Prepared transaction is left on the node, but we can not
+                 * do anything with that except warn the user.
+                 */
+                ereport(WARNING,
+                        (errcode(ERRCODE_INTERNAL_ERROR),
+                         errmsg("failed to send xid to "
+                                "the node %u", conn->nodeoid)));
+            }
+            if (pgxc_node_send_query(conn, abort_cmd))
+            {
+                /*
+                 * Prepared transaction is left on the node, but we can not
+                 * do anything with that except warn the user.
+ */ + ereport(WARNING, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send ABORT PREPARED command to " + "the node %u", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + if (conn_count > 0) + { + /* Just read out responses, throw error from the first combiner */ + ResponseCombiner combiner2; + InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2); + CloseCombiner(&combiner2); + } + /* + * If the flag is set we are here because combiner carries error message + */ + if (isOK) + pgxc_node_report_error(&combiner); + else + elog(ERROR, "failed to PREPARE transaction on one or more nodes"); + return NULL; +} + + +/* + * Commit transactions on remote nodes. + * If barrier lock is set wait while it is released. + * Release remote connection after completion. + */ +static void +pgxc_node_remote_commit(void) +{ + int result = 0; + char *commitCmd = "COMMIT TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + SetSendCommandId(false); + + /* + * Barrier: + * + * We should acquire the BarrierLock in SHARE mode here to ensure that + * there are no in-progress barrier at this point. This mechanism would + * work as long as LWLock mechanism does not starve a EXCLUSIVE lock + * requester + */ + LWLockAcquire(BarrierLock, LW_SHARED); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + /* + * We do not need to commit remote node if it is not in transaction. 
+ * If transaction is in error state the commit command will cause + * rollback, that is OK + */ + if (conn->transaction_status != 'I') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + if (pgxc_node_send_query(conn, commitCmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to the node %u", + conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + /* + * We do not need to commit remote node if it is not in transaction. + * If transaction is in error state the commit command will cause + * rollback, that is OK + */ + if (conn->transaction_status != 'I') + { + if (pgxc_node_send_query(conn, commitCmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send COMMIT command to the node %u", + conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + /* + * Release the BarrierLock. 
+ */ + LWLockRelease(BarrierLock); + + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + result = EOF; + else + CloseCombiner(&combiner); + } + + stat_transaction(conn_count); + + if (result) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } +} + + +/* + * Rollback transactions on remote nodes. + * Release remote connection after completion. + */ +static void +pgxc_node_remote_abort(void) +{ + int result = 0; + char *rollbackCmd = "ROLLBACK TRANSACTION"; + int i; + ResponseCombiner combiner; + PGXCNodeHandle *connections[MaxDataNodes + MaxCoords]; + int conn_count = 0; + PGXCNodeAllHandles *handles = get_current_handles(); + + SetSendCommandId(false); + + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + if (conn->transaction_status != 'I') + { + /* Read in any pending input */ + if (conn->state != DN_CONNECTION_STATE_IDLE) + BufferConnection(conn); + + /* + * Do not matter, is there committed or failed transaction, + * just send down rollback to finish it. 
+ */ + if (pgxc_node_send_query(conn, rollbackCmd)) + { + add_error_message(conn, + "failed to send ROLLBACK TRANSACTION command"); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + + /* Skip empty slots */ + if (conn->sock == NO_SOCKET) + continue; + + if (conn->transaction_status != 'I') + { + /* + * Do not matter, is there committed or failed transaction, + * just send down rollback to finish it. + */ + if (pgxc_node_send_query(conn, rollbackCmd)) + { + add_error_message(conn, + "failed to send ROLLBACK TRANSACTION command"); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + } + + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner); + if (result || !validate_combiner(&combiner)) + result = EOF; + else + CloseCombiner(&combiner); + } + + stat_transaction(conn_count); + + if (result) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to ROLLBACK the transaction on one or more nodes"))); + } +} + +#else + /* * Prepare all remote nodes involved in this transaction. The local node is * handled separately and prepared first in xact.c. 
If there is any error @@ -1922,6 +3766,8 @@ pgxc_node_remote_commit(void) } } + stat_transaction(write_conn_count + read_conn_count); + if (result) { if (combiner) @@ -2079,11 +3925,133 @@ pgxc_node_remote_abort(void) return; } +#endif + /* * Begin COPY command * The copy_connections array must have room for NumDataNodes items */ +#ifdef XCP +void +DataNodeCopyBegin(RemoteCopyData *rcstate) +{ + int i; + List *nodelist = rcstate->rel_loc->nodeList; + PGXCNodeHandle **connections; + bool need_tran_block; + GlobalTransactionId gxid; + ResponseCombiner combiner; + Snapshot snapshot = GetActiveSnapshot(); + int conn_count = list_length(nodelist); + + /* Get needed datanode connections */ + if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType)) + { + /* Connections is a single handle to read from */ + connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *)); + connections[0] = get_any_handle(nodelist); + conn_count = 1; + } + else + { + PGXCNodeAllHandles *pgxc_handles; + pgxc_handles = get_handles(nodelist, NULL, false); + connections = pgxc_handles->datanode_handles; + Assert(pgxc_handles->dn_conn_count == conn_count); + pfree(pgxc_handles); + } + + /* + * If more than one nodes are involved or if we are already in a + * transaction block, we must the remote statements in a transaction block + */ + need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T'); + + elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count, + need_tran_block ? 
"true" : "false"); + + /* Gather statistics */ + stat_statement(); + stat_transaction(conn_count); + + gxid = GetCurrentTransactionId(); + + /* Start transaction on connections where it is not started */ + if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data nodes."))); + } + + /* + * COPY TO do not use locator, it just takes connections from it, and + * we do not look up distribution data type in this case. + * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not + * defined partType if real locator type is HASH or MODULO. + * Create locator before sending down query, because createLocator may + * fail and we leave with dirty connections. + * If we get an error now datanode connection will be clean and error + * handler will issue transaction abort. + */ + rcstate->locator = createLocator( + rcstate->is_from ? rcstate->rel_loc->locatorType + : LOCATOR_TYPE_RROBIN, + rcstate->is_from ? 
RELATION_ACCESS_INSERT : RELATION_ACCESS_READ, + rcstate->dist_type, + LOCATOR_LIST_POINTER, + conn_count, + (void *) connections, + NULL, + false); + + /* Send query to nodes */ + for (i = 0; i < conn_count; i++) + { + CHECK_OWNERSHIP(connections[i], NULL); + + if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot)) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0) + { + add_error_message(connections[i], "Can not send request"); + pfree(connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + } + + /* + * We are expecting CopyIn response, but do not want to send it to client, + * caller should take care about this, because here we do not know if + * client runs console or file copy + */ + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + + /* Receive responses */ + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) + || !ValidateAndCloseCombiner(&combiner)) + { + DataNodeCopyFinish(conn_count, connections); + freeLocator(rcstate->locator); + rcstate->locator = NULL; + return; + } + pfree(connections); +} +#else PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot) { @@ -2127,6 +4095,10 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot) foreach(nodeitem, nodelist) copy_connections[lfirst_int(nodeitem)] = connections[i++]; + /* Gather statistics */ + stat_statement(); + stat_transaction(conn_count); + gxid = GetCurrentTransactionId(); if (!GlobalTransactionIdIsValid(gxid)) @@ -2185,10 +4157,91 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot) pfree(connections); return copy_connections; } +#endif + /* * Send a data row to 
the specified nodes */ +#ifdef XCP +int +DataNodeCopyIn(char *data_row, int len, int conn_count, PGXCNodeHandle** copy_connections) +{ + /* size + data row + \n */ + int msgLen = 4 + len + 1; + int nLen = htonl(msgLen); + int i; + + for(i = 0; i < conn_count; i++) + { + PGXCNodeHandle *handle = copy_connections[i]; + if (handle->state == DN_CONNECTION_STATE_COPY_IN) + { + /* precalculate to speed up access */ + int bytes_needed = handle->outEnd + 1 + msgLen; + + /* flush buffer if it is almost full */ + if (bytes_needed > COPY_BUFFER_SIZE) + { + int to_send = handle->outEnd; + + /* First look if data node has sent a error message */ + int read_status = pgxc_node_read_data(handle, true); + if (read_status == EOF || read_status < 0) + { + add_error_message(handle, "failed to read data from data node"); + return EOF; + } + + if (handle->inStart < handle->inEnd) + { + ResponseCombiner combiner; + InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + handle_response(handle, &combiner); + if (!ValidateAndCloseCombiner(&combiner)) + return EOF; + } + + if (DN_CONNECTION_STATE_ERROR(handle)) + return EOF; + + /* + * Try to send down buffered data if we have + */ + if (to_send && send_some(handle, to_send) < 0) + { + add_error_message(handle, "failed to send data to data node"); + return EOF; + } + } + + if (ensure_out_buffer_capacity(bytes_needed, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'd'; + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + memcpy(handle->outBuffer + handle->outEnd, data_row, len); + handle->outEnd += len; + handle->outBuffer[handle->outEnd++] = '\n'; + } + else + { + add_error_message(handle, "Invalid data node connection"); + return EOF; + } + } + return 0; +} +#else int DataNodeCopyIn(char *data_row, int len, 
ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections) { @@ -2340,7 +4393,81 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** } return 0; } +#endif + + +#ifdef XCP +uint64 +DataNodeCopyOut(PGXCNodeHandle** copy_connections, + int conn_count, FILE* copy_file) +{ + ResponseCombiner combiner; + uint64 processed; + bool error; + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + combiner.processed = 0; + /* If there is an existing file where to copy data, pass it to combiner */ + if (copy_file) + { + combiner.copy_file = copy_file; + combiner.remoteCopyType = REMOTE_COPY_FILE; + } + else + { + combiner.copy_file = NULL; + combiner.remoteCopyType = REMOTE_COPY_STDOUT; + } + error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0); + + processed = combiner.processed; + + if (!ValidateAndCloseCombiner(&combiner) || error) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type))); + } + + return processed; +} + + +uint64 +DataNodeCopyStore(PGXCNodeHandle** copy_connections, + int conn_count, Tuplestorestate* store) +{ + ResponseCombiner combiner; + uint64 processed; + bool error; + + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + combiner.processed = 0; + combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE; + combiner.tuplestorestate = store; + + error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0); + processed = combiner.processed; + + if (!ValidateAndCloseCombiner(&combiner) || error) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Unexpected response from the data nodes when combining, request type %d", 
combiner.request_type))); + } + + return processed; +} +#else uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, @@ -2416,16 +4543,28 @@ DataNodeCopyOut(ExecNodes *exec_nodes, return processed; } +#endif + /* * Finish copy process on all connections */ +#ifdef XCP +void +DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections) +#else void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type) +#endif { int i; +#ifdef XCP + ResponseCombiner combiner; +#else RemoteQueryState *combiner = NULL; +#endif bool error = false; +#ifndef XCP struct timeval *timeout = NULL; /* wait forever */ PGXCNodeHandle *connections[NumDataNodes]; PGXCNodeHandle *primary_handle = NULL; @@ -2453,6 +4592,7 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, Comb combiner = CreateResponseCombiner(conn_count + 1, combine_type); error = (pgxc_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error; } +#endif for (i = 0; i < conn_count; i++) { @@ -2463,11 +4603,22 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, Comb error = DataNodeCopyEnd(handle, false); } +#ifdef XCP + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error; + + if (!ValidateAndCloseCombiner(&combiner) || error) +#else if (!combiner) combiner = CreateResponseCombiner(conn_count, combine_type); error = (pgxc_node_receive_responses(conn_count, connections, timeout, combiner) != 0) || error; if (!ValidateAndCloseCombiner(combiner) || error) +#endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Error while running COPY"))); @@ -2503,6 +4654,8 @@ DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error) return false; } + +#ifndef XCP RemoteQueryState * 
ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) { @@ -2555,7 +4708,12 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) * If there are parameters supplied, get them into a form to be sent to the * Datanodes with bind message. We should not have had done this before. */ - SetDataRowForExtParams(estate->es_param_list_info, remotestate); + if (estate->es_param_list_info) + { + Assert(!remotestate->paramval_data); + remotestate->paramval_len = ParamListToDataRow(estate->es_param_list_info, + &remotestate->paramval_data); + } /* * Initialize result tuple type and projection info. @@ -2571,6 +4729,8 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) return remotestate; } +#endif + /* * Get Node connections depending on the connection type: @@ -2596,8 +4756,14 @@ get_exec_connections(RemoteQueryState *planstate, if (exec_type == EXEC_ON_COORDS) is_query_coord_only = true; +#ifdef XCP + if (exec_type == EXEC_ON_CURRENT) + return get_current_handles(); +#endif + if (exec_nodes) { +#ifndef XCP if (exec_nodes->en_expr) { /* execution time determining of target Datanodes */ @@ -2644,10 +4810,7 @@ get_exec_connections(RemoteQueryState *planstate, else if (nodes) { if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) - { nodelist = exec_nodes->nodeList; - primarynode = exec_nodes->primarynodelist; - } } if (nodes) @@ -2655,6 +4818,7 @@ get_exec_connections(RemoteQueryState *planstate, FreeRelationLocInfo(rel_loc_info); } else +#endif { if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) nodelist = exec_nodes->nodeList; @@ -2734,22 +4898,31 @@ get_exec_connections(RemoteQueryState *planstate, return pgxc_handles; } + static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection, RemoteQueryState *remotestate, Snapshot snapshot) { CommandId cid; +#ifdef XCP + ResponseCombiner *combiner = (ResponseCombiner *) remotestate; + RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan; 
+ CHECK_OWNERSHIP(connection, combiner); +#else RemoteQuery *step = (RemoteQuery *) remotestate->ss.ps.plan; if (connection->state == DN_CONNECTION_STATE_QUERY) BufferConnection(connection); +#endif /* * Scan descriptor would be valid and would contain a valid snapshot * in cases when we need to send out of order command id to data node * e.g. in case of a fetch */ - +#ifdef XCP + cid = GetCurrentCommandId(false); +#else if (remotestate->cursor != NULL && remotestate->cursor[0] != '\0' && remotestate->ss.ss_currentScanDesc != NULL && @@ -2770,29 +4943,26 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, else cid = GetCurrentCommandId(false); } +#endif if (pgxc_node_send_cmd_id(connection, cid) < 0 ) return false; if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) return false; - if (step->statement || step->cursor || remotestate->rqs_num_params) + if (step->statement || step->cursor || step->remote_param_types) { /* need to use Extended Query Protocol */ int fetch = 0; bool prepared = false; - bool send_desc = false; - - if (step->base_tlist != NULL || - step->exec_nodes->accesstype == RELATION_ACCESS_READ || - step->has_row_marks) - send_desc = true; +#ifndef XCP /* if prepared statement is referenced see if it is already exist */ if (step->statement) prepared = ActivateDatanodeStatementOnNode(step->statement, PGXCNodeGetNodeId(connection->nodeoid, PGXC_NODE_DATANODE)); +#endif /* * execute and fetch rows only if they will be consumed * immediately by the sorter @@ -2800,15 +4970,19 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, if (step->cursor) fetch = 1; +#ifdef XCP + combiner->extended_query = true; +#endif + if (pgxc_node_send_query_extended(connection, prepared ? 
NULL : step->sql_statement, step->statement, step->cursor, - remotestate->rqs_num_params, - remotestate->rqs_param_types, + step->remote_num_params, + step->remote_param_types, remotestate->paramval_len, remotestate->paramval_data, - send_desc, + step->has_row_marks ? true : step->read_only, fetch) != 0) return false; } @@ -2821,31 +4995,8 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, } -/* - * IsReturningDMLOnReplicatedTable - * - * This function returns true if the passed RemoteQuery - * 1. Operates on a table that is replicated - * 2. Represents a DML - * 3. Has a RETURNING clause in it - * - * If the passed RemoteQuery has a non null base_tlist - * means that DML has a RETURNING clause. - */ - -static bool -IsReturningDMLOnReplicatedTable(RemoteQuery *rq) -{ - if (IsExecNodesReplicated(rq->exec_nodes) && - rq->base_tlist != NULL && /* Means DML has RETURNING */ - (rq->exec_nodes->accesstype == RELATION_ACCESS_UPDATE || - rq->exec_nodes->accesstype == RELATION_ACCESS_INSERT)) - return true; - - return false; -} - -void +#ifndef XCP +static void do_query(RemoteQueryState *node) { RemoteQuery *step = (RemoteQuery *) node->ss.ps.plan; @@ -2857,18 +5008,12 @@ do_query(RemoteQueryState *node) PGXCNodeHandle **connections = NULL; PGXCNodeHandle *primaryconnection = NULL; int i; - int regular_conn_count = 0; + int regular_conn_count; + int total_conn_count; bool need_tran_block; PGXCNodeAllHandles *pgxc_connections; /* - * A Postgres-XC node cannot run transactions while in recovery as - * this operation needs transaction IDs. This is more a safety guard than anything else. - */ - if (RecoveryInProgress()) - elog(ERROR, "cannot run transaction to remote nodes during recovery"); - - /* * Remember if the remote query is accessing a temp object * * !! 
PGXC TODO Check if the is_temp flag is propogated correctly when a @@ -2878,22 +5023,6 @@ do_query(RemoteQueryState *node) ExecSetTempObjectIncluded(); /* - * Consider a test case - * - * create table rf(a int, b int) distributed by replication; - * insert into rf values(1,2),(3,4) returning ctid; - * - * While inserting the first row do_query works fine, receives the returned - * row from the first connection and returns it. In this iteration the other - * datanodes also had returned rows but they have not yet been read from the - * network buffers. On Next Iteration do_query does not enter the data - * receiving loop because it finds that node->connections is not null. - * It is therefore required to set node->connections to null here. - */ - if (node->conn_count == 0) - node->connections = NULL; - - /* * Get connections for Datanodes only, utilities and DDLs * are launched in ExecRemoteUtility */ @@ -2902,17 +5031,19 @@ do_query(RemoteQueryState *node) if (step->exec_type == EXEC_ON_DATANODES) { connections = pgxc_connections->datanode_handles; - regular_conn_count = pgxc_connections->dn_conn_count; + total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; } else if (step->exec_type == EXEC_ON_COORDS) { connections = pgxc_connections->coord_handles; - regular_conn_count = pgxc_connections->co_conn_count; + total_conn_count = regular_conn_count = pgxc_connections->co_conn_count; } primaryconnection = pgxc_connections->primary_handle; - /* Primary connection is counted separately */ + /* + * Primary connection is counted separately but is included in total_conn_count if used. + */ if (primaryconnection) regular_conn_count--; @@ -2951,6 +5082,9 @@ do_query(RemoteQueryState *node) "need_tran_block = %s", primaryconnection ? "true" : "false", regular_conn_count, need_tran_block ? 
"true" : "false"); + stat_statement(); + stat_transaction(total_conn_count); + gxid = GetCurrentTransactionId(); if (!GlobalTransactionIdIsValid(gxid)) @@ -2992,30 +5126,6 @@ do_query(RemoteQueryState *node) res = handle_response(primaryconnection, node); if (res == RESPONSE_COMPLETE) break; - else if (res == RESPONSE_TUPDESC) - { - ExecSetSlotDescriptor(scanslot, node->tuple_desc); - /* - * Now tuple table slot is responsible for freeing the - * descriptor - */ - node->tuple_desc = NULL; - /* - * RemoteQuery node doesn't support backward scan, so - * randomAccess is false, neither we want this tuple store - * persist across transactions. - */ - node->tuplestorestate = tuplestore_begin_heap(false, false, work_mem); - tuplestore_set_eflags(node->tuplestorestate, node->eflags); - } - else if (res == RESPONSE_DATAROW) - { - pfree(node->currentRow.msg); - node->currentRow.msg = NULL; - node->currentRow.msglen = 0; - node->currentRow.msgnode = 0; - continue; - } else if (res == RESPONSE_EOF) continue; else @@ -3106,15 +5216,33 @@ do_query(RemoteQueryState *node) * descriptor */ node->tuple_desc = NULL; - if (node->rqs_for_sort) + if (step->sort) { + SimpleSort *sort = step->sort; + + node->connections = connections; + node->conn_count = regular_conn_count; /* * First message is already in the buffer - * Further fetch will be under the control of Sort plan - * above. So, don't wait till first row is fetched. + * Further fetch will be under tuplesort control + * If query does not produce rows tuplesort will not + * be initialized + */ + node->tuplesortstate = tuplesort_begin_merge( + scanslot->tts_tupleDescriptor, + sort->numCols, + sort->sortColIdx, + sort->sortOperators, + sort->sortCollations, + sort->nullsFirst, + node, + work_mem); + /* + * Break the loop, do not wait for first row. 
+ * Tuplesort module want to control node it is + * fetching rows from, while in this loop first + * row would be got from random node */ - node->connections = connections; - node->conn_count = regular_conn_count; break; } else @@ -3145,15 +5273,6 @@ do_query(RemoteQueryState *node) } /* report error if any */ pgxc_node_report_error(node); - - if (node->rqs_for_sort) - { - /* - * Break the loop, do not wait for first row. See comment above for - * rqs_for_sort. - */ - break; - } } if (node->cursor_count) @@ -3223,6 +5342,13 @@ RemoteQueryNext(ScanState *scan_node) node->update_cursor = NULL; pfree_pgxc_all_handles(all_dn_handles); } + + /* We can't have both tuplesortstate and tuplestorestate */ + Assert(!(node->tuplesortstate && node->tuplestorestate)); + + if (node->tuplesortstate) + tuplesort_gettupleslot((Tuplesortstate *) node->tuplesortstate, + true, scanslot); else if(node->tuplestorestate) { /* @@ -3242,50 +5368,7 @@ RemoteQueryNext(ScanState *scan_node) eof_tuplestore = true; } - /* - * Consider a test case - * - * create table ta1 (v1 int, v2 int); - * insert into ta1 values(1,2),(2,3),(3,4); - * - * create table ta2 (v1 int, v2 int); - * insert into ta2 values(1,2),(2,3),(3,4); - * - * select t1.ctid, t2.ctid,* from ta1 t1, ta2 t2 - * where t2.v2<=3 order by t1.v1; - * ctid | ctid | v1 | v2 | v1 | v2 - * -------+-------+----+----+----+---- - * Row_1 (0,1) | (0,1) | 1 | 2 | 1 | 2 - * Row_2 (0,1) | (0,2) | 1 | 2 | 2 | 3 - * Row_3 (0,2) | (0,1) | 2 | 3 | 1 | 2 - * Row_4 (0,2) | (0,2) | 2 | 3 | 2 | 3 - * Row_5 (0,1) | (0,1) | 3 | 4 | 1 | 2 - * Row_6 (0,1) | (0,2) | 3 | 4 | 2 | 3 - * (6 rows) - * - * Note that in the resulting join, we are getting one row of ta1 twice, - * as shown by the ctid's in the results. Now consider this update - * - * update ta1 t1 set v2=t1.v2+10 from ta2 t2 - * where t2.v2<=3 returning t1.ctid,t1.v1 t1_v1, t1.v2 t1_v2; - * - * The first iteration of the update runs for Row_1, succeeds and - * updates its ctid to say (0,3). 
In the second iteration for Row_2, - * since the ctid of the row has already changed, fails to update any - * row and hence do_query does not return any tuple. The FetchTuple - * call in RemoteQueryNext hence fails and eof_underlying is set to true. - * However in the third iteration for Row_3, the update succeeds and - * returns a row, but since the eof_underlying is already set to true, - * the RemoteQueryNext does not bother calling FetchTuple, we therefore - * do not get more than one row returned as a result of the update - * returning query. It is therefore required in RemoteQueryNext to call - * FetchTuple in case do_query has copied a row in node->currentRow.msg. - * Also we have to reset the eof_underlying flag every time - * FetchTuple succeeds to clear any previously set status. - */ - if (eof_tuplestore && - (!node->eof_underlying || - (node->currentRow.msg != NULL))) + if (eof_tuplestore && !node->eof_underlying) { /* * If tuplestore has reached its end but the underlying RemoteQueryNext() hasn't @@ -3293,15 +5376,13 @@ RemoteQueryNext(ScanState *scan_node) */ if (FetchTuple(node, scanslot)) { - /* See comments a couple of lines above */ - node->eof_underlying = false; - /* - * Append a copy of the returned tuple to tuplestore. NOTE: because - * the tuplestore is certainly in EOF state, its read position will - * move forward over the added tuple. This is what we want. - */ - if (tuplestorestate && !TupIsNull(scanslot)) - tuplestore_puttupleslot(tuplestorestate, scanslot); + /* + * Append a copy of the returned tuple to tuplestore. NOTE: because + * the tuplestore is certainly in EOF state, its read position will + * move forward over the added tuple. This is what we want. 
+ */ + if (tuplestorestate && !TupIsNull(scanslot)) + tuplestore_puttupleslot(tuplestorestate, scanslot); } else node->eof_underlying = true; @@ -3310,8 +5391,7 @@ RemoteQueryNext(ScanState *scan_node) if (eof_tuplestore && node->eof_underlying) ExecClearTuple(scanslot); } - else if (node->rqs_for_sort) - getrow_for_tapesort(node, scanslot); + /* No tuple store whatsoever, no result from the datanode */ else ExecClearTuple(scanslot); @@ -3368,9 +5448,13 @@ ExecEndRemoteQuery(RemoteQueryState *node) if (res == RESPONSE_EOF) { struct timeval timeout; +#ifdef XCP + timeout.tv_sec = END_QUERY_TIMEOUT / 1000; + timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000; +#else timeout.tv_sec = END_QUERY_TIMEOUT; timeout.tv_usec = 0; - +#endif if (pgxc_node_receive(1, &conn, &timeout)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -3378,9 +5462,15 @@ ExecEndRemoteQuery(RemoteQueryState *node) } } - if (node->tuplestorestate != NULL) + if (node->tuplesortstate != NULL || node->tuplestorestate != NULL) ExecClearTuple(node->ss.ss_ScanTupleSlot); /* + * Release tuplesort resources + */ + if (node->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) node->tuplesortstate); + node->tuplesortstate = NULL; + /* * Release tuplestore resources */ if (node->tuplestorestate != NULL) @@ -3441,18 +5531,12 @@ ExecEndRemoteQuery(RemoteQueryState *node) node->paramval_len = 0; } - /* Free the param types if they are newly allocated */ - if (node->rqs_param_types && - node->rqs_param_types != ((RemoteQuery*)node->ss.ps.plan)->rq_param_types) - { - pfree(node->rqs_param_types); - node->rqs_param_types = NULL; - node->rqs_num_params = 0; - } - if (node->ss.ss_currentRelation) ExecCloseScanRelation(node->ss.ss_currentRelation); + if (node->tmp_ctx) + MemoryContextDelete(node->tmp_ctx); + CloseCombiner(node); } @@ -3506,52 +5590,35 @@ close_node_cursors(PGXCNodeHandle **connections, int conn_count, char *cursor) ValidateAndCloseCombiner(combiner); } +#endif /* * Encode parameter values 
to format of DataRow message (the same format is * used in Bind) to prepare for sending down to Datanodes. - * The data row is copied to RemoteQueryState.paramval_data. + * The buffer to store encoded value is palloc'ed and returned as the result + * parameter. Function returns size of the result */ -void -SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state) +int +ParamListToDataRow(ParamListInfo params, char** result) { StringInfoData buf; uint16 n16; int i; int real_num_params = 0; - RemoteQuery *node = (RemoteQuery*) rq_state->ss.ps.plan; - - /* If there are no parameters, there is no data to BIND. */ - if (!paraminfo) - return; - - /* - * If this query has been generated internally as a part of two-step DML - * statement, it uses only the internal parameters for input values taken - * from the source data, and it never uses external parameters. So even if - * parameters were being set externally, they won't be present in this - * statement (they might be present in the source data query). In such - * case where parameters refer to the values returned by SELECT query, the - * parameter data and parameter types would be set in SetDataRowForIntParams(). - */ - if (node->rq_params_internal) - return; - - Assert(!rq_state->paramval_data); /* * It is necessary to fetch parameters * before looking at the output value. 
*/ - for (i = 0; i < paraminfo->numParams; i++) + for (i = 0; i < params->numParams; i++) { ParamExternData *param; - param = ¶minfo->params[i]; + param = ¶ms->params[i]; - if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL) - (*paraminfo->paramFetch) (paraminfo, i + 1); + if (!OidIsValid(param->ptype) && params->paramFetch != NULL) + (*params->paramFetch) (params, i + 1); /* * This is the last parameter found as useful, so we need @@ -3570,9 +5637,8 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state) */ if (real_num_params == 0) { - rq_state->paramval_data = NULL; - rq_state->paramval_len = 0; - return; + *result = NULL; + return 0; } initStringInfo(&buf); @@ -3584,7 +5650,7 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state) /* Parameter values */ for (i = 0; i < real_num_params; i++) { - ParamExternData *param = ¶minfo->params[i]; + ParamExternData *param = ¶ms->params[i]; uint32 n32; /* @@ -3627,38 +5693,15 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state) } } - - /* - * If parameter types are not already set, infer them from - * the paraminfo. - */ - if (node->rq_num_params > 0) - { - /* - * Use the already known param types for BIND. Parameter types - * can be already known when the same plan is executed multiple - * times. 
- */ - if (node->rq_num_params != real_num_params) - elog(ERROR, "Number of user-supplied parameters do not match " - "the number of remote parameters"); - rq_state->rqs_num_params = node->rq_num_params; - rq_state->rqs_param_types = node->rq_param_types; - } - else - { - rq_state->rqs_num_params = real_num_params; - rq_state->rqs_param_types = (Oid *) palloc(sizeof(Oid) * real_num_params); - for (i = 0; i < real_num_params; i++) - rq_state->rqs_param_types[i] = paraminfo->params[i].ptype; - } - - /* Assign the newly allocated data row to paramval */ - rq_state->paramval_data = buf.data; - rq_state->paramval_len = buf.len; + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + pfree(buf.data); + return buf.len; } +#ifndef XCP /* ---------------------------------------------------------------- * ExecRemoteQueryReScan * @@ -3673,11 +5716,23 @@ ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt) */ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - if (!node->tuplestorestate) - return; + if (((RemoteQuery *) node->ss.ps.plan)->sort) + { + if (!node->tuplesortstate) + return; + + tuplesort_rescan(node->tuplesortstate); + } + else + { + if (!node->tuplestorestate) + return; + + tuplestore_rescan(node->tuplestorestate); + } - tuplestore_rescan(node->tuplestorestate); } +#endif /* @@ -3695,10 +5750,17 @@ void ExecRemoteUtility(RemoteQuery *node) { RemoteQueryState *remotestate; +#ifdef XCP + ResponseCombiner *combiner; +#endif bool force_autocommit = node->force_autocommit; RemoteQueryExecType exec_type = node->exec_type; GlobalTransactionId gxid = InvalidGlobalTransactionId; +#ifdef XCP + Snapshot snapshot = NULL; +#else Snapshot snapshot = GetActiveSnapshot(); +#endif PGXCNodeAllHandles *pgxc_connections; int co_conn_count; int dn_conn_count; @@ -3709,6 +5771,11 @@ ExecRemoteUtility(RemoteQuery *node) if (!force_autocommit) RegisterTransactionLocalNode(true); +#ifdef XCP + remotestate = 
makeNode(RemoteQueryState); + combiner = (ResponseCombiner *)remotestate; + InitResponseCombiner(combiner, 0, node->combine_type); +#else /* * It is possible to invoke create table with inheritance on * temporary objects. Remember that we might have accessed a temp object @@ -3717,11 +5784,17 @@ ExecRemoteUtility(RemoteQuery *node) ExecSetTempObjectIncluded(); remotestate = CreateResponseCombiner(0, node->combine_type); +#endif pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type); dn_conn_count = pgxc_connections->dn_conn_count; co_conn_count = pgxc_connections->co_conn_count; +#ifdef XCP + /* exit right away if no nodes to run command on */ + if (dn_conn_count == 0 && co_conn_count == 0) + return; +#endif if (force_autocommit) need_tran_block = false; @@ -3741,12 +5814,18 @@ ExecRemoteUtility(RemoteQuery *node) } gxid = GetCurrentTransactionId(); +#ifdef XCP + if (ActiveSnapshotSet()) + snapshot = GetActiveSnapshot(); +#endif if (!GlobalTransactionIdIsValid(gxid)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to get next transaction ID"))); +#ifndef XCP if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES) +#endif { if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) @@ -3774,7 +5853,9 @@ ExecRemoteUtility(RemoteQuery *node) } } +#ifndef XCP if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) +#endif { if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles, gxid, need_tran_block, false, PGXC_NODE_COORDINATOR)) @@ -3803,8 +5884,10 @@ ExecRemoteUtility(RemoteQuery *node) * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations */ +#ifndef XCP if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES) +#endif { while (dn_conn_count > 0) { @@ -3822,12 +5905,26 @@ ExecRemoteUtility(RemoteQuery *node) while (i < dn_conn_count) { PGXCNodeHandle *conn = 
pgxc_connections->datanode_handles[i]; +#ifdef XCP + int res = handle_response(conn, combiner); +#else int res = handle_response(conn, remotestate); +#endif if (res == RESPONSE_EOF) { i++; } else if (res == RESPONSE_COMPLETE) +#ifdef XCP + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) +#endif { if (i < --dn_conn_count) pgxc_connections->datanode_handles[i] = @@ -3850,8 +5947,10 @@ ExecRemoteUtility(RemoteQuery *node) } /* Make the same for Coordinators */ +#ifndef XCP if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) +#endif { while (co_conn_count > 0) { @@ -3862,12 +5961,26 @@ ExecRemoteUtility(RemoteQuery *node) while (i < co_conn_count) { +#ifdef XCP + int res = handle_response(pgxc_connections->coord_handles[i], combiner); +#else int res = handle_response(pgxc_connections->coord_handles[i], remotestate); +#endif if (res == RESPONSE_EOF) { i++; } else if (res == RESPONSE_COMPLETE) +#ifdef XCP + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) +#endif { if (i < --co_conn_count) pgxc_connections->coord_handles[i] = @@ -3893,7 +6006,11 @@ ExecRemoteUtility(RemoteQuery *node) * error message pending we can report it. All connections should be in * consistent state now and so they can be released to the pool after ROLLBACK. 
*/ +#ifdef XCP + pgxc_node_report_error(combiner); +#else pgxc_node_report_error(remotestate); +#endif } @@ -3903,19 +6020,26 @@ ExecRemoteUtility(RemoteQuery *node) void PGXCNodeCleanAndRelease(int code, Datum arg) { +#ifndef XCP /* Clean up prepared transactions before releasing connections */ DropAllPreparedStatements(); /* Release Datanode connections */ release_handles(); +#endif - /* Disconnect from Pooler */ + /* Disconnect from Pooler, if any connection is still held Pooler close it */ PoolManagerDisconnect(); /* Close connection with GTM */ CloseGTM(); + + /* Dump collected statistics to the log */ + stat_log(); } + +#ifndef XCP static int pgxc_get_connections(PGXCNodeHandle *connections[], int size, List *connlist) { @@ -3939,13 +6063,19 @@ pgxc_get_transaction_nodes(PGXCNodeHandle *connections[], int size, bool write) { return pgxc_get_connections(connections, size, write ? XactWriteNodes : XactReadNodes); } +#endif + void ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) { PGXCNodeAllHandles *all_handles; PGXCNodeHandle **connections; +#ifdef XCP + ResponseCombiner combiner; +#else RemoteQueryState *combiner; +#endif int conn_count; int i; @@ -3983,7 +6113,15 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) } } +#ifdef XCP + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); +#else combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); +#endif while (conn_count > 0) { @@ -3999,24 +6137,39 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) i = 0; while (i < conn_count) { +#ifdef XCP + int res = handle_response(connections[i], &combiner); +#else int res = handle_response(connections[i], combiner); +#endif if (res == RESPONSE_EOF) { i++; } +#ifdef XCP + else if (res == RESPONSE_READY || + connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL) +#else else if (res == RESPONSE_COMPLETE) 
+#endif { if (--conn_count > i) connections[i] = connections[conn_count]; } +#ifndef XCP else { connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; } +#endif } } +#ifdef XCP + ValidateAndCloseCombiner(&combiner); +#else ValidateAndCloseCombiner(combiner); +#endif pfree_pgxc_all_handles(all_handles); } @@ -4025,14 +6178,23 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) * * In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode. */ +#ifdef XCP +int +DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count, + PGXCNodeHandle** connections) +#else int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections) +#endif { int i; +#ifndef XCP int conn_count = 0; PGXCNodeHandle *connections[NumDataNodes]; +#endif int msgLen = 4 + len + 1; int nLen = htonl(msgLen); +#ifndef XCP for (i = 0; i < NumDataNodes; i++) { PGXCNodeHandle *handle = copy_connections[i]; @@ -4042,6 +6204,7 @@ int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_con connections[conn_count++] = handle; } +#endif for (i = 0; i < conn_count; i++) { @@ -4073,6 +6236,7 @@ int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_con return 0; } +#ifndef XCP /* * ExecSetTempObjectIncluded * @@ -4106,138 +6270,38 @@ ExecIsTempObjectIncluded(void) } /* - * ExecProcNodeDMLInXC - * - * This function is used by ExecInsert/Update/Delete to execute the - * Insert/Update/Delete on the datanode using RemoteQuery plan. - * - * In XC, a non-FQSed UPDATE/DELETE is planned as a two step process - * The first step selects the ctid & node id of the row to be modified and the - * second step creates a parameterized query that is supposed to take the data - * row returned by the lower plan node as the parameters to modify the affected - * row. 
In case of an INSERT however the first step is used to get the new - * column values to be inserted in the target table and the second step uses - * those values as parameters of the INSERT query. - * - * We use extended query protocol to avoid repeated planning of the query and - * pass the column values(in case of an INSERT) and ctid & xc_node_id - * (in case of UPDATE/DELETE) as parameters while executing the query. - * - * Parameters: - * resultRemoteRel: The RemoteQueryState containing DML statement to be - * executed - * previousStepSlot: The tuple returned by the first step (described above) - * to be used as parameters in the second step. - * - * Returns the result of RETURNING clause if any + * Execute given tuple in the remote relation. We use extended query protocol + * to avoid repeated planning of the query. So we must pass the column values + * as parameters while executing the query. + * This is used by queries using a remote query planning of standard planner. */ -TupleTableSlot * -ExecProcNodeDMLInXC(RemoteQueryState *resultRemoteRel, - TupleTableSlot *previousStepSlot) +void +ExecRemoteQueryStandard(Relation resultRelationDesc, + RemoteQueryState *resultRemoteRel, + TupleTableSlot *slot) { - ExprContext *econtext = resultRemoteRel->ss.ps.ps_ExprContext; - TupleTableSlot *returningResultSlot = NULL; /* RETURNING clause result */ - TupleTableSlot *temp_slot; - bool dml_returning_on_replicated = false; - RemoteQuery *step = (RemoteQuery *) resultRemoteRel->ss.ps.plan; - - /* - * If the tuple returned by the previous step was null, - * simply return null tuple, no need to execute the DML - */ - if (TupIsNull(previousStepSlot)) - return NULL; - - /* - * The current implementation of DMLs with RETURNING when run on replicated - * tables returns row from one of the datanodes. In order to achieve this - * ExecProcNode is repeatedly called saving one tuple and rejecting the rest. - * Do we have a DML on replicated table with RETURNING? 
- */ - dml_returning_on_replicated = IsReturningDMLOnReplicatedTable(step); - - /* - * Use data row returned by the previous step as parameter for - * the DML to be executed in this step. - */ - SetDataRowForIntParams(previousStepSlot, resultRemoteRel); + ExprContext *econtext = resultRemoteRel->ss.ps.ps_ExprContext; /* - * do_query calls get_exec_connections to determine target nodes - * at execution time. The function get_exec_connections can decide - * to evaluate en_expr to determine the target nodes. To evaluate en_expr, - * ExecEvalVar is called which picks up values from ecxt_scantuple if Var - * does not refer either OUTER or INNER varno. Hence we should copy the - * tuple returned by previous step in ecxt_scantuple if econtext is set. - * The econtext is set only when en_expr is set for execution time - * determination of the target nodes. + * Use data row returned by the previous step as a parameters for + * the main query. */ - if (econtext) - econtext->ecxt_scantuple = previousStepSlot; - - /* - * Consider the case of a non FQSed INSERT for example. The executor keeps - * track of # of tuples processed in es_processed member of EState structure. - * When a non-FQSed INSERT completes this member is increased once due to - * estate->es_processed += rowcount - * in HandleCommandComplete and once due to - * (estate->es_processed)++ - * in ExecInsert. The result is that although only one row is inserted we - * get message as if two rows got inserted INSERT 0 2. Now consider the - * same INSERT case when it is FQSed. In this case the # of tuples processed - * is increased just once in HandleCommandComplete since ExecInsert is never - * called in this case and hence we get correct output i.e. INSERT 0 1 - * To handle this error in processed tuple counting we use a variable - * non_fqs_dml which indicates whether this DML is FQSed or not. 
To indicate - * that this DML is not FQSed non_fqs_dml is set to true here and then if - * it is found true in HandleCommandComplete we skip handling of - * es_processed there and let ExecInsert do the processed tuple counting. - */ - resultRemoteRel->non_fqs_dml = true; - - /* - * This loop would be required to reject tuples received from datanodes - * when a DML with RETURNING is run on a replicated table otherwise it - * would run once. - * PGXC_TODO: This approach is error prone if the DML statement constructed - * by the planner is such that it updates more than one row (even in case of - * non-replicated data). Fix it. - */ - do + if (!TupIsNull(slot)) { - temp_slot = ExecProcNode((PlanState *)resultRemoteRel); - if (!TupIsNull(temp_slot)) - { - /* Have we already copied the returned tuple? */ - if (returningResultSlot == NULL) - { - /* Copy the received tuple to be returned later */ - returningResultSlot = MakeSingleTupleTableSlot(temp_slot->tts_tupleDescriptor); - returningResultSlot = ExecCopySlot(returningResultSlot, temp_slot); - } - /* Clear the received tuple, the copy required has already been saved */ - ExecClearTuple(temp_slot); - } - else - { - /* Null tuple received, so break the loop */ - ExecClearTuple(temp_slot); - break; - } - } while (dml_returning_on_replicated); + resultRemoteRel->paramval_len = ExecCopySlotDatarow(slot, + &resultRemoteRel->paramval_data); - /* - * A DML can impact more than one row, e.g. an update without any where - * clause on a table with more than one row. We need to make sure that - * RemoteQueryNext calls do_query for each affected row, hence we reset - * the flag here and finish the DML being executed only when we return - * NULL from ExecModifyTable - */ - resultRemoteRel->query_Done = false; - - return returningResultSlot; + /* + * The econtext is set only when en_expr is set for execution time + * evalulation of the target node. 
+ */ + if (econtext) + econtext->ecxt_scantuple = slot; + do_query(resultRemoteRel); + } } + void RegisterTransactionNodes(int count, void **connections, bool write) { @@ -4277,6 +6341,7 @@ ForgetTransactionNodes(void) list_free(XactWriteNodes); XactWriteNodes = NIL; } +#endif /* * Clear per transaction remote information @@ -4284,11 +6349,57 @@ ForgetTransactionNodes(void) void AtEOXact_Remote(void) { +#ifdef XCP + PGXCNodeResetParams(true); +#else ExecClearTempObjectIncluded(); ForgetTransactionNodes(); clear_RemoteXactState(); +#endif } +#ifdef XCP +/* + * Invoked when local transaction is about to be committed. + * If nodestring is specified commit specified prepared transaction on remote + * nodes, otherwise commit remote nodes which are in transaction. + */ +void +PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode) +{ + /* + * Made node connections persistent if we are committing transaction + * that touched temporary tables. We never drop that flag, so after some + * transaction has created a temp table the session's remote connections + * become persistent. + * We do not need to set that flag if transaction that has created a temp + * table finally aborts - remote connections are not holding temporary + * objects in this case. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && MyXactAccessedTempRel) + temp_object_included = true; + + + /* + * OK, everything went fine. At least one remote node is in PREPARED state + * and the transaction is successfully prepared on all the involved nodes. + * Now we are ready to commit the transaction. We need a new GXID to send + * down the remote nodes to execute the forthcoming COMMIT PREPARED + * command. So grab one from the GTM and track it. It will be closed along + * with the main transaction at the end. 
+ */ + if (nodestring) + { + Assert(preparedLocalNode); + pgxc_node_remote_finish(prepareGID, true, nodestring, + GetAuxilliaryTransactionId(), + GetTopGlobalTransactionId()); + + } + else + pgxc_node_remote_commit(); +} +#else /* * Do pre-commit processing for remote nodes which includes Datanodes and * Coordinators. If more than one nodes are involved in the transaction write @@ -4354,6 +6465,7 @@ PreCommit_Remote(char *prepareGID, bool preparedLocalNode) if (!PersistentConnections) release_handles(); } +#endif /* * Do abort processing for the transaction. We must abort the transaction on @@ -4371,6 +6483,104 @@ PreCommit_Remote(char *prepareGID, bool preparedLocalNode) bool PreAbort_Remote(void) { +#ifdef XCP + /* + * We are about to abort current transaction, and there could be an + * unexpected error leaving the node connection in some state requiring + * clean up, like COPY or pending query results. + * If we are running copy we should send down CopyFail message and read + * all possible incoming messages, there could be copy rows (if running + * COPY TO) ErrorResponse, ReadyForQuery. + * If there are pending results (connection state is DN_CONNECTION_STATE_QUERY) + * we just need to read them in and discard, all necessary commands are + * already sent. The end of input could be CommandComplete or + * PortalSuspended, in either case subsequent ROLLBACK closes the portal. + */ + PGXCNodeAllHandles *all_handles; + PGXCNodeHandle *clean_nodes[NumCoords + NumDataNodes]; + int node_count = 0; + int i; + + all_handles = get_current_handles(); + /* + * Find "dirty" coordinator connections. + * COPY is never running on a coordinator connections, we just check for + * pending data. + */ + for (i = 0; i < all_handles->co_conn_count; i++) + { + PGXCNodeHandle *handle = all_handles->coord_handles[i]; + + if (handle->state == DN_CONNECTION_STATE_QUERY) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. 
+ */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + } + } + + /* + * The same for data nodes, but cancel COPY if it is running. + */ + for (i = 0; i < all_handles->dn_conn_count; i++) + { + PGXCNodeHandle *handle = all_handles->datanode_handles[i]; + + if (handle->state == DN_CONNECTION_STATE_QUERY) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. + */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + } + else if (handle->state == DN_CONNECTION_STATE_COPY_IN || + handle->state == DN_CONNECTION_STATE_COPY_OUT) + { + DataNodeCopyEnd(handle, true); + clean_nodes[node_count++] = handle; + } + } + + pfree_pgxc_all_handles(all_handles); + + /* + * Now read and discard any data from the connections found "dirty" + */ + if (node_count > 0) + { + ResponseCombiner combiner; + + InitResponseCombiner(&combiner, node_count, COMBINE_TYPE_NONE); + /* + * Make sure there are zeroes in unused fields + */ + memset(&combiner, 0, sizeof(ScanState)); + combiner.connections = clean_nodes; + combiner.conn_count = node_count; + combiner.request_type = REQUEST_TYPE_ERROR; + + pgxc_connections_cleanup(&combiner); + + /* prevent pfree'ing local variable */ + combiner.connections = NULL; + + CloseCombiner(&combiner); + } + + pgxc_node_remote_abort(); + + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } +#else if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { cancel_query(); @@ -4423,13 +6633,62 @@ PreAbort_Remote(void) if (!PersistentConnections) release_handles(); +#endif return true; } + +/* + * Invoked when local transaction is about to be prepared. + * If invoked on a Datanode just commit transaction on remote connections, + * since secondary sessions are read only and never need to be prepared. 
+ * Otherwise run PREPARE on remote connections, where writable commands were + * sent (connections marked as not read-only). + * If that is explicit PREPARE (issued by client) notify GTM. + * In case of implicit PREPARE not involving local node (ex. caused by + * INSERT, UPDATE or DELETE) commit prepared transaction immediately. + * Return list of node names where transaction was actually prepared, include + * the name of the local node if localNode is true. + */ char * PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit) { +#ifdef XCP + /* Always include local node if running explicit prepare */ + char *nodestring; + + /* + * Primary session is doing 2PC, just commit secondary processes and exit + */ + if (IS_PGXC_DATANODE) + { + pgxc_node_remote_commit(); + return NULL; + } + + nodestring = pgxc_node_remote_prepare(prepareGID, + !implicit || localNode); + + if (!implicit && IS_PGXC_COORDINATOR && !IsConnFromCoord()) + /* Save the node list and gid on GTM. */ + StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID, + nodestring); + + /* + * If no need to commit on local node go ahead and commit prepared + * transaction right away. + */ + if (implicit && !localNode && nodestring) + { + pgxc_node_remote_finish(prepareGID, true, nodestring, + GetAuxilliaryTransactionId(), + GetTopGlobalTransactionId()); + pfree(nodestring); + return NULL; + } + return nodestring; +#else init_RemoteXactState(false); /* * PREPARE the transaction on all nodes including remote nodes as well as @@ -4446,8 +6705,21 @@ PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit) preparedNodes = pgxc_node_get_nodelist(true); return preparedNodes; +#endif } +#ifdef XCP +/* + * Invoked immediately after local node is prepared. + * Notify GTM about completed prepare. 
+ */ +void +PostPrepare_Remote(char *prepareGID, bool implicit) +{ + if (!implicit) + PrepareTranGTM(GetTopGlobalTransactionId()); +} +#else void PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit) { @@ -4472,7 +6744,9 @@ PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit) /* Now forget the transaction nodes */ ForgetTransactionNodes(); } +#endif +#ifndef XCP /* * Return the list of nodes where the prepared transaction is not yet committed */ @@ -4522,10 +6796,65 @@ pgxc_node_get_nodelist(bool localNode) return nodestring; } +#endif + +#ifdef XCP +/* + * Returns true if 2PC is required for consistent commit: if there was write + * activity on two or more nodes within current transaction. + */ bool IsTwoPhaseCommitRequired(bool localWrite) { + PGXCNodeAllHandles *handles; + bool found = localWrite; + int i; + + /* Never run 2PC on Datanode-to-Datanode connection */ + if (IS_PGXC_DATANODE) + return false; + + if (MyXactAccessedTempRel) + { + elog(DEBUG1, "Transaction accessed temporary objects - " + "2PC will not be used and that can lead to data inconsistencies " + "in case of failures"); + return false; + } + + handles = get_current_handles(); + for (i = 0; i < handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = handles->datanode_handles[i]; + if (conn->sock != NO_SOCKET && !conn->read_only && + conn->transaction_status == 'T') + { + if (found) + return true; /* second found */ + else + found = true; /* first found */ + } + } + for (i = 0; i < handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = handles->coord_handles[i]; + if (conn->sock != NO_SOCKET && !conn->read_only && + conn->transaction_status == 'T') + { + if (found) + return true; /* second found */ + else + found = true; /* first found */ + } + } + return false; +} +#else +bool +IsTwoPhaseCommitRequired(bool localWrite) +{ + if ((list_length(XactWriteNodes) > 1) || ((list_length(XactWriteNodes) == 1) && localWrite)) { @@ -4555,21 +6884,12 @@ 
clear_RemoteXactState(void) if ((remoteXactState.remoteNodeHandles == NULL) || (remoteXactState.maxRemoteNodes < (NumDataNodes + NumCoords))) { - if (!remoteXactState.remoteNodeHandles) - remoteXactState.remoteNodeHandles = (PGXCNodeHandle **) - malloc(sizeof(PGXCNodeHandle *) * (MaxDataNodes + MaxCoords)); - else - remoteXactState.remoteNodeHandles = (PGXCNodeHandle **) - realloc(remoteXactState.remoteNodeHandles, - sizeof(PGXCNodeHandle *) * (NumDataNodes + NumCoords)); - if (!remoteXactState.remoteNodeStatus) - remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *) - malloc(sizeof(RemoteXactNodeStatus) * (MaxDataNodes + MaxCoords)); - else - remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *) - realloc (remoteXactState.remoteNodeStatus, - sizeof(RemoteXactNodeStatus) * (NumDataNodes + NumCoords)); - + remoteXactState.remoteNodeHandles = (PGXCNodeHandle **) + realloc (remoteXactState.remoteNodeHandles, + sizeof (PGXCNodeHandle *) * (NumDataNodes + NumCoords)); + remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *) + realloc (remoteXactState.remoteNodeStatus, + sizeof (RemoteXactNodeStatus) * (NumDataNodes + NumCoords)); remoteXactState.maxRemoteNodes = NumDataNodes + NumCoords; } @@ -4606,16 +6926,30 @@ init_RemoteXactState(bool preparedLocalNode) remoteXactState.numReadRemoteNodes = read_conn_count; } +#endif + +/* + * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes. + * Contacts GTM for the list of involved nodes and for work complete + * notification. Returns true if prepared transaction on local node needs to be + * finished too. 
+ */ bool FinishRemotePreparedTransaction(char *prepareGID, bool commit) { +#ifdef XCP + char *nodestring; + GlobalTransactionId gxid, prepare_gxid; + bool prepared_local = false; +#else char *nodename, *nodestring; List *nodelist = NIL, *coordlist = NIL; GlobalTransactionId gxid, prepare_gxid; PGXCNodeAllHandles *pgxc_handles; bool prepared_local = false; int i; +#endif /* * Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not @@ -4655,7 +6989,44 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) (errcode(ERRCODE_INTERNAL_ERROR), errmsg("prepared transaction with identifier \"%s\" does not exist", prepareGID))); +#ifdef XCP + prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring, + gxid, prepare_gxid); + if (commit) + { + CommitPreparedTranGTM(prepare_gxid, gxid); + } + else + { + RollbackTranGTM(prepare_gxid); + RollbackTranGTM(gxid); + } + + return prepared_local; +} + + +/* + * Complete previously prepared transactions on remote nodes. + * Release remote connection after completion. 
+ */ +static bool +pgxc_node_remote_finish(char *prepareGID, bool commit, + char *nodestring, GlobalTransactionId gxid, + GlobalTransactionId prepare_gxid) +{ + char finish_cmd[256]; + PGXCNodeHandle *connections[MaxCoords + MaxDataNodes]; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles; + bool prepared_local = false; + char *nodename; + List *nodelist = NIL; + List *coordlist = NIL; + int i; +#endif /* * Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the * remote nodes and also finish the transaction locally is required @@ -4663,6 +7034,19 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) nodename = strtok(nodestring, ","); while (nodename != NULL) { +#ifdef XCP + int nodeIndex; + char nodetype; + + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + nodename))); +#else Oid nodeoid; int nodeIndex; char nodetype; @@ -4678,6 +7062,7 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) /* Get node type and index */ nodetype = get_pgxc_nodetype(nodeoid); nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid)); +#endif /* Check if node is requested is the self-node or not */ if (nodetype == PGXC_NODE_COORDINATOR) @@ -4693,6 +7078,104 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) nodename = strtok(NULL, ","); } +#ifdef XCP + if (nodelist == NIL && coordlist == NIL) + return prepared_local; + + pgxc_handles = get_handles(nodelist, coordlist, false); + + if (commit) + sprintf(finish_cmd, "COMMIT PREPARED '%s'", prepareGID); + else + sprintf(finish_cmd, "ROLLBACK PREPARED '%s'", prepareGID); + + for (i = 0; i < pgxc_handles->dn_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i]; + + if (pgxc_node_send_gxid(conn, gxid)) + { + 
ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send GXID for %s PREPARED command", + commit ? "COMMIT" : "ROLLBACK"))); + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send %s PREPARED command to the node %u", + commit ? "COMMIT" : "ROLLBACK", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + + for (i = 0; i < pgxc_handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_handles->coord_handles[i]; + + if (pgxc_node_send_gxid(conn, gxid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send GXID for %s PREPARED command", + commit ? "COMMIT" : "ROLLBACK"))); + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + /* + * Do not bother with clean up, just bomb out. The error handler + * will invoke RollbackTransaction which will do the work. + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to send %s PREPARED command to the node %u", + commit ? 
"COMMIT" : "ROLLBACK", conn->nodeoid))); + } + else + { + /* Read responses from these */ + connections[conn_count++] = conn; + } + } + + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + /* Receive responses */ + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + + pfree_pgxc_all_handles(pgxc_handles); + + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } +#else /* * Now get handles for all the involved Datanodes and the Coordinators */ @@ -4756,16 +7239,1800 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) */ clear_RemoteXactState(); ForgetTransactionNodes(); +#endif return prepared_local; } + +#ifdef XCP +/***************************************************************************** + * + * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and + * ExecEndRemoteQuery: in XCP they are only used to execute simple queries. 
+ * + *****************************************************************************/ +RemoteQueryState * +ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) +{ + RemoteQueryState *remotestate; + ResponseCombiner *combiner; + + remotestate = makeNode(RemoteQueryState); + combiner = (ResponseCombiner *) remotestate; + InitResponseCombiner(combiner, 0, COMBINE_TYPE_NONE); + combiner->ss.ps.plan = (Plan *) node; + combiner->ss.ps.state = estate; + + combiner->ss.ps.qual = NIL; + + combiner->request_type = REQUEST_TYPE_QUERY; + + ExecInitResultTupleSlot(estate, &combiner->ss.ps); + if (node->scan.plan.targetlist) + ExecAssignResultTypeFromTL((PlanState *) remotestate); + + /* + * If there are parameters supplied, get them into a form to be sent to the + * datanodes with bind message. We should not have had done this before. + */ + if (estate->es_param_list_info) + { + Assert(!remotestate->paramval_data); + remotestate->paramval_len = ParamListToDataRow(estate->es_param_list_info, + &remotestate->paramval_data); + } + + /* We need expression context to evaluate */ + if (node->exec_nodes && node->exec_nodes->en_expr) + { + Expr *expr = node->exec_nodes->en_expr; + + if (IsA(expr, Var) && ((Var *) expr)->vartype == TIDOID) + { + /* Special case if expression does not need to be evaluated */ + } + else + { + /* prepare expression evaluation */ + ExecAssignExprContext(estate, &combiner->ss.ps); + } + } + + return remotestate; +} + + +/* + * Execute step of PGXC plan. + * The step specifies a command to be executed on specified nodes. + * On first invocation connections to the data nodes are initialized and + * command is executed. Further, as well as within subsequent invocations, + * responses are received until step is completed or there is a tuple to emit. + * If there is a tuple it is returned, otherwise returned NULL. The NULL result + * from the function indicates completed step. + * The function returns at most one tuple per invocation. 
+ */ +TupleTableSlot * +ExecRemoteQuery(RemoteQueryState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan; + TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot; + if (!node->query_Done) + { + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot = GetActiveSnapshot(); + PGXCNodeHandle **connections = NULL; + PGXCNodeHandle *primaryconnection = NULL; + int i; + int regular_conn_count = 0; + int total_conn_count = 0; + bool need_tran_block; + PGXCNodeAllHandles *pgxc_connections; + + /* + * Get connections for Datanodes only, utilities and DDLs + * are launched in ExecRemoteUtility + */ + pgxc_connections = get_exec_connections(node, step->exec_nodes, + step->exec_type); + + if (step->exec_type == EXEC_ON_DATANODES) + { + connections = pgxc_connections->datanode_handles; + total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; + } + else if (step->exec_type == EXEC_ON_COORDS) + { + connections = pgxc_connections->coord_handles; + total_conn_count = regular_conn_count = pgxc_connections->co_conn_count; + } + + primaryconnection = pgxc_connections->primary_handle; + + /* + * Primary connection is counted separately but is included in total_conn_count if used. 
+ */ + if (primaryconnection) + regular_conn_count--; + + pfree(pgxc_connections); + + /* + * We save only regular connections, at the time we exit the function + * we finish with the primary connection and deal only with regular + * connections on subsequent invocations + */ + combiner->node_count = regular_conn_count; + + /* + * Start transaction on data nodes if we are in explicit transaction + * or going to use extended query protocol or write to multiple nodes + */ + if (step->force_autocommit) + need_tran_block = false; + else + need_tran_block = step->cursor || + (!step->read_only && total_conn_count > 1) || + (TransactionBlockStatusCode() == 'T'); + + stat_statement(); + stat_transaction(total_conn_count); + + gxid = GetCurrentTransactionId(); + + if (!GlobalTransactionIdIsValid(gxid)) + { + if (primaryconnection) + pfree(primaryconnection); + pfree(connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + + /* See if we have a primary node, execute on it first before the others */ + if (primaryconnection) + { + if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node."))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot)) + { + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + Assert(combiner->combine_type == COMBINE_TYPE_SAME); + + pgxc_node_receive(1, &primaryconnection, NULL); + /* Make sure the command is completed on the primary node */ + while (true) + { + int res = handle_response(primaryconnection, combiner); + if (res == RESPONSE_READY) + break; + else if (res == RESPONSE_EOF) + pgxc_node_receive(1, &primaryconnection, NULL); + else if 
(res == RESPONSE_COMPLETE || res == RESPONSE_ERROR) + /* Get ReadyForQuery */ + continue; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from data node"))); + } + if (combiner->errorMessage) + { + char *code = combiner->errorCode; + if (combiner->errorDetail != NULL) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) )); + else + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage))); + } + } + + for (i = 0; i < regular_conn_count; i++) + { + if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node."))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) + { + pfree(connections); + if (primaryconnection) + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + connections[i]->combiner = combiner; + } + + if (step->cursor) + { + combiner->cursor = step->cursor; + combiner->cursor_count = regular_conn_count; + combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *)); + memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *)); + } + + combiner->connections = connections; + combiner->conn_count = regular_conn_count; + combiner->current_conn = 0; + + if (combiner->cursor_count) + { + combiner->conn_count = combiner->cursor_count; + memcpy(connections, combiner->cursor_connections, + combiner->cursor_count * sizeof(PGXCNodeHandle *)); + combiner->connections = connections; + } + + node->query_Done = true; + + if (step->sort) + { + SimpleSort 
*sort = step->sort; + + /* + * First message is already in the buffer + * Further fetch will be under tuplesort control + * If query does not produce rows tuplesort will not + * be initialized + */ + combiner->tuplesortstate = tuplesort_begin_merge( + resultslot->tts_tupleDescriptor, + sort->numCols, + sort->sortColIdx, + sort->sortOperators, + sort->sortCollations, + sort->nullsFirst, + combiner, + work_mem); + } + } + + if (combiner->tuplesortstate) + { + if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate, + true, resultslot)) + return resultslot; + else + ExecClearTuple(resultslot); + } + else + { + TupleTableSlot *slot = FetchTuple(combiner); + if (!TupIsNull(slot)) + return slot; + } + + if (combiner->errorMessage) + { + char *code = combiner->errorCode; + if (combiner->errorDetail != NULL) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) )); + else + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage))); + } + + return NULL; +} + + +/* + * Clean up and discard any data on the data node connections that might not + * handled yet, including pending on the remote connection. + */ +static void +pgxc_connections_cleanup(ResponseCombiner *combiner) +{ + /* clean up the buffer */ + list_free_deep(combiner->rowBuffer); + combiner->rowBuffer = NIL; + + /* + * Read in and discard remaining data from the connections, if any + */ + combiner->current_conn = 0; + while (combiner->conn_count > 0) + { + int res; + PGXCNodeHandle *conn = combiner->connections[combiner->current_conn]; + + /* + * Possible if we are doing merge sort. 
+ * We can do usual procedure and move connections around since we are + * cleaning up and do not care what connection at what position + */ + if (conn == NULL) + { + REMOVE_CURR_CONN(combiner); + continue; + } + + /* throw away current message that may be in the buffer */ + if (combiner->currentRow) + { + pfree(combiner->currentRow); + combiner->currentRow = NULL; + } + + /* no data is expected */ + if (conn->state == DN_CONNECTION_STATE_IDLE || + conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + REMOVE_CURR_CONN(combiner); + continue; + } + + /* + * Connection owner is different, so no our data pending at + * the connection, nothing to read in. + */ + if (conn->combiner && conn->combiner != combiner) + { + REMOVE_CURR_CONN(combiner); + continue; + } + + res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + struct timeval timeout; +#ifdef XCP + timeout.tv_sec = END_QUERY_TIMEOUT / 1000; + timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000; +#else + timeout.tv_sec = END_QUERY_TIMEOUT; + timeout.tv_usec = 0; +#endif + + if (pgxc_node_receive(1, &conn, &timeout)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes when ending query"))); + } + } + + /* + * Release tuplesort resources + */ + if (combiner->tuplesortstate) + { + /* + * Free these before tuplesort_end, because these arrays may appear + * in the tuplesort's memory context, tuplesort_end deletes this + * context and may invalidate the memory. + * We still want to free them here, because these may be in different + * context. 
+ */ + if (combiner->tapenodes) + { + pfree(combiner->tapenodes); + combiner->tapenodes = NULL; + } + if (combiner->tapemarks) + { + pfree(combiner->tapemarks); + combiner->tapemarks = NULL; + } + /* + * tuplesort_end invalidates minimal tuple if it is in the slot because + * deletes the TupleSort memory context, causing seg fault later when + * releasing tuple table + */ + ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot); + tuplesort_end((Tuplesortstate *) combiner->tuplesortstate); + combiner->tuplesortstate = NULL; + } +} + + +/* + * End the remote query + */ +void +ExecEndRemoteQuery(RemoteQueryState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + + /* + * Clean up remote connections + */ + pgxc_connections_cleanup(combiner); + + /* + * Clean up parameters if they were set, since plan may be reused + */ + if (node->paramval_data) + { + pfree(node->paramval_data); + node->paramval_data = NULL; + node->paramval_len = 0; + } + + CloseCombiner(combiner); + pfree(node); +} + + +/********************************************** + * + * Routines to support RemoteSubplan plan node + * + **********************************************/ + + +/* + * The routine walks recursively over the plan tree and changes cursor names of + * RemoteSubplan nodes to make them different from launched from the other + * datanodes. The routine changes cursor names in place, so caller should + * take writable copy of the plan tree. 
+ */ +void +RemoteSubplanMakeUnique(Node *plan, int unique) +{ + if (plan == NULL) + return; + + if (IsA(plan, List)) + { + ListCell *lc; + foreach(lc, (List *) plan) + { + RemoteSubplanMakeUnique(lfirst(lc), unique); + } + return; + } + + /* + * Transform SharedQueue name + */ + if (IsA(plan, RemoteSubplan)) + { + ((RemoteSubplan *)plan)->unique = unique; + } + /* Otherwise it is a Plan descendant */ + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->initPlan, unique); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique); + /* Tranform special cases */ + switch (nodeTag(plan)) + { + case T_Append: + RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans, + unique); + break; + case T_MergeAppend: + RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans, + unique); + break; + case T_BitmapAnd: + RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans, + unique); + break; + case T_BitmapOr: + RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans, + unique); + break; + case T_SubqueryScan: + RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan, + unique); + break; + default: + break; + } +} + +struct find_params_context +{ + RemoteParam *rparams; + Bitmapset *defineParams; +}; + +static bool +determine_param_types_walker(Node *node, struct find_params_context *context) +{ + if (node == NULL) + return false; + + if (IsA(node, Param)) + { + Param *param = (Param *) node; + int paramno = param->paramid; + + if (param->paramkind == PARAM_EXEC && + bms_is_member(paramno, context->defineParams)) + { + RemoteParam *cur = context->rparams; + while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno) + cur++; + cur->paramtype = param->paramtype; + context->defineParams = bms_del_member(context->defineParams, + paramno); + return bms_is_empty(context->defineParams); + } + } + return expression_tree_walker(node, 
determine_param_types_walker, + (void *) context); + +} + +/* + * Scan expressions in the plan tree to find Param nodes and get data types + * from them + */ +static bool +determine_param_types(Plan *plan, struct find_params_context *context) +{ + Bitmapset *intersect; + + if (plan == NULL) + return false; + + intersect = bms_intersect(plan->allParam, context->defineParams); + if (bms_is_empty(intersect)) + { + /* the subplan does not depend on params we are interested in */ + bms_free(intersect); + return false; + } + bms_free(intersect); + + /* scan target list */ + if (expression_tree_walker((Node *) plan->targetlist, + determine_param_types_walker, + (void *) context)) + return true; + /* scan qual */ + if (expression_tree_walker((Node *) plan->qual, + determine_param_types_walker, + (void *) context)) + return true; + + /* Check additional node-type-specific fields */ + switch (nodeTag(plan)) + { + case T_Result: + if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_SeqScan: + break; + + case T_IndexScan: + if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_IndexOnlyScan: + if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_BitmapIndexScan: + if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_BitmapHeapScan: + if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_TidScan: + if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals, + determine_param_types_walker, + (void *) context)) + return true; + break; + + 
case T_SubqueryScan: + if (determine_param_types(((SubqueryScan *) plan)->subplan, context)) + return true; + break; + + case T_FunctionScan: + if (expression_tree_walker((Node *) ((FunctionScan *) plan)->funcexpr, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_ValuesScan: + if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_ModifyTable: + { + ListCell *l; + + foreach(l, ((ModifyTable *) plan)->plans) + { + if (determine_param_types((Plan *) lfirst(l), context)) + return true; + } + } + break; + + case T_RemoteSubplan: + break; + + case T_Append: + { + ListCell *l; + + foreach(l, ((Append *) plan)->appendplans) + { + if (determine_param_types((Plan *) lfirst(l), context)) + return true; + } + } + break; + + case T_BitmapAnd: + { + ListCell *l; + + foreach(l, ((BitmapAnd *) plan)->bitmapplans) + { + if (determine_param_types((Plan *) lfirst(l), context)) + return true; + } + } + break; + + case T_BitmapOr: + { + ListCell *l; + + foreach(l, ((BitmapOr *) plan)->bitmapplans) + { + if (determine_param_types((Plan *) lfirst(l), context)) + return true; + } + } + break; + + case T_NestLoop: + if (expression_tree_walker((Node *) ((Join *) plan)->joinqual, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_MergeJoin: + if (expression_tree_walker((Node *) ((Join *) plan)->joinqual, + determine_param_types_walker, + (void *) context)) + return true; + if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_HashJoin: + if (expression_tree_walker((Node *) ((Join *) plan)->joinqual, + determine_param_types_walker, + (void *) context)) + return true; + if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses, + determine_param_types_walker, + (void *) context)) + return true; + 
break; + + case T_Limit: + if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset, + determine_param_types_walker, + (void *) context)) + return true; + if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount, + determine_param_types_walker, + (void *) context)) + return true; + break; + + case T_RecursiveUnion: + break; + + case T_LockRows: + break; + + case T_WindowAgg: + if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset, + determine_param_types_walker, + (void *) context)) + if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset, + determine_param_types_walker, + (void *) context)) + break; + + case T_Hash: + case T_Agg: + case T_Material: + case T_Sort: + case T_Unique: + case T_SetOp: + case T_Group: + break; + + default: + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(plan)); + } + + + /* recurse into subplans */ + return determine_param_types(plan->lefttree, context) || + determine_param_types(plan->righttree, context); +} + + +RemoteSubplanState * +ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) +{ + RemoteStmt rstmt; + RemoteSubplanState *remotestate; + ResponseCombiner *combiner; + CombineType combineType; + + remotestate = makeNode(RemoteSubplanState); + combiner = (ResponseCombiner *) remotestate; + /* + * We do not need to combine row counts if we will receive intermediate + * results or if we won't return row count. + */ + if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT) + { + combineType = COMBINE_TYPE_NONE; + remotestate->execOnAll = node->execOnAll; + } + else + { + if (node->execOnAll) + combineType = COMBINE_TYPE_SUM; + else + combineType = COMBINE_TYPE_SAME; + /* + * If we are updating replicated table we should run plan on all nodes. 
+ * We are choosing single node only to read + */ + remotestate->execOnAll = true; + } + remotestate->execNodes = list_copy(node->nodeList); + InitResponseCombiner(combiner, 0, combineType); + combiner->ss.ps.plan = (Plan *) node; + combiner->ss.ps.state = estate; + + combiner->ss.ps.qual = NIL; + + combiner->request_type = REQUEST_TYPE_QUERY; + + ExecInitResultTupleSlot(estate, &combiner->ss.ps); + ExecAssignResultTypeFromTL((PlanState *) remotestate); + + /* + * We optimize execution if we going to send down query to next level + */ + remotestate->local_exec = false; + if (IS_PGXC_DATANODE) + { + if (remotestate->execNodes == NIL) + { + /* + * Special case, if subplan is not distributed, like Result, or + * query against catalog tables only. + * We are only interested in filtering out the subplan results and + * get only those we are interested in. + * XXX we may want to prevent multiple executions in this case + * either, to achieve this we will set single execNode on planning + * time and this case would never happen, this code branch could + * be removed. + */ + remotestate->local_exec = true; + } + else if (!remotestate->execOnAll) + { + /* + * XXX We should change planner and remove this flag. + * We want only one node is producing the replicated result set, + * and planner should choose that node - it is too hard to determine + * right node at execution time, because it should be guaranteed + * that all consumers make the same decision. + * For now always execute replicated plan on local node to save + * resources. + */ + + /* + * Make sure local node is in execution list + */ + if (list_member_int(remotestate->execNodes, PGXCNodeId-1)) + { + list_free(remotestate->execNodes); + remotestate->execNodes = NIL; + remotestate->local_exec = true; + } + else + { + /* + * To support, we need to connect to some producer, so + * each producer should be prepared to serve rows for random + * number of consumers. 
It is hard, because new consumer may + * connect after producing is started, on the other hand, + * absence of expected consumer is a problem too. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Getting replicated results from remote node is not supported"))); + } + } + } + + /* + * If we are going to execute subplan locally or doing explain initialize + * the subplan. Otherwise have remote node doing that. + */ + if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY)) + { + outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate, + eflags); + if (node->distributionNodes) + { + Oid distributionType = InvalidOid; + TupleDesc typeInfo; + + typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor; + if (node->distributionKey != InvalidAttrNumber) + { + Form_pg_attribute attr; + attr = typeInfo->attrs[node->distributionKey - 1]; + distributionType = attr->atttypid; + } + /* Set up locator */ + remotestate->locator = createLocator(node->distributionType, + RELATION_ACCESS_INSERT, + distributionType, + LOCATOR_LIST_LIST, + 0, + (void *) node->distributionNodes, + (void **) &remotestate->dest_nodes, + false); + } + else + remotestate->locator = NULL; + } + + /* + * Encode subplan if it will be sent to remote nodes + */ + if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) + { + ParamListInfo ext_params; + /* Encode plan if we are going to execute it on other nodes */ + rstmt.type = T_RemoteStmt; + if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE) + { + /* + * There are cases when planner can not determine distribution of a + * subplan, in particular it does not determine distribution of + * subquery nodes. Such subplans executed from current location + * (node) and combine all results, like from coordinator nodes. 
+ * However, if there are multiple locations where distributed + * executor is running this node, and there are more of + * RemoteSubplan plan nodes in the subtree there will be a problem - + * Instances of the inner RemoteSubplan nodes will be using the same + * SharedQueue, causing error. To avoid this problem we should + * traverse the subtree and change SharedQueue name to make it + * unique. + */ + RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + } + rstmt.planTree = outerPlan(node); + /* + * If datanode launch further execution of a command it should tell + * it is a SELECT, otherwise secondary data nodes won't return tuples + * expecting there will be nothing to return. + */ + if (IsA(outerPlan(node), ModifyTable)) + { + rstmt.commandType = estate->es_plannedstmt->commandType; + rstmt.hasReturning = estate->es_plannedstmt->hasReturning; + rstmt.resultRelations = estate->es_plannedstmt->resultRelations; + } + else + { + rstmt.commandType = CMD_SELECT; + rstmt.hasReturning = false; + rstmt.resultRelations = NIL; + } + rstmt.rtable = estate->es_range_table; + rstmt.subplans = estate->es_plannedstmt->subplans; + rstmt.nParamExec = estate->es_plannedstmt->nParamExec; + ext_params = estate->es_param_list_info; + rstmt.nParamRemote = (ext_params ? 
ext_params->numParams : 0) + + bms_num_members(node->scan.plan.allParam); + if (rstmt.nParamRemote > 0) + { + Bitmapset *tmpset; + int i; + int paramno; + + /* Allocate enough space */ + rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote * + sizeof(RemoteParam)); + paramno = 0; + if (ext_params) + { + for (i = 0; i < ext_params->numParams; i++) + { + ParamExternData *param = &ext_params->params[i]; + /* + * If parameter type is not yet defined but can be defined + * do that + */ + if (!OidIsValid(param->ptype) && ext_params->paramFetch) + (*ext_params->paramFetch) (ext_params, i + 1); + /* + * If parameter type is still not defined assume it is + * unused + */ + if (!OidIsValid(param->ptype)) + continue; + + rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN; + rstmt.remoteparams[paramno].paramid = i + 1; + rstmt.remoteparams[paramno].paramtype = param->ptype; + paramno++; + } + /* store actual number of parameters */ + rstmt.nParamRemote = paramno; + } + + if (!bms_is_empty(node->scan.plan.allParam)) + { + Bitmapset *defineParams = NULL; + tmpset = bms_copy(node->scan.plan.allParam); + while ((i = bms_first_member(tmpset)) >= 0) + { + ParamExecData *prmdata; + + prmdata = &(estate->es_param_exec_vals[i]); + rstmt.remoteparams[paramno].paramkind = PARAM_EXEC; + rstmt.remoteparams[paramno].paramid = i; + rstmt.remoteparams[paramno].paramtype = prmdata->ptype; + /* Will scan plan tree to find out data type of the param */ + if (prmdata->ptype == InvalidOid) + defineParams = bms_add_member(defineParams, i); + paramno++; + } + /* store actual number of parameters */ + rstmt.nParamRemote = paramno; + bms_free(tmpset); + if (!bms_is_empty(defineParams)) + { + struct find_params_context context; + bool all_found; + + context.rparams = rstmt.remoteparams; + context.defineParams = defineParams; + + all_found = determine_param_types(node->scan.plan.lefttree, + &context); + /* + * Remove not defined params from the list of remote params. 
+ * If they are not referenced no need to send them down + */ + if (!all_found) + { + for (i = 0; i < rstmt.nParamRemote; i++) + { + if (rstmt.remoteparams[i].paramkind == PARAM_EXEC && + bms_is_member(rstmt.remoteparams[i].paramid, + context.defineParams)) + { + /* Copy last parameter inplace */ + rstmt.nParamRemote--; + if (i < rstmt.nParamRemote) + rstmt.remoteparams[i] = + rstmt.remoteparams[rstmt.nParamRemote]; + /* keep current in the same position */ + i--; + } + } + } + bms_free(context.defineParams); + } + } + remotestate->nParamRemote = rstmt.nParamRemote; + remotestate->remoteparams = rstmt.remoteparams; + } + else + rstmt.remoteparams = NULL; + rstmt.rowMarks = estate->es_plannedstmt->rowMarks; + rstmt.distributionKey = node->distributionKey; + rstmt.distributionType = node->distributionType; + rstmt.distributionNodes = node->distributionNodes; + rstmt.distributionRestrict = node->distributionRestrict; + + set_portable_output(true); + remotestate->subplanstr = nodeToString(&rstmt); + set_portable_output(false); + + /* + * Connect to remote nodes and send down subplan + */ + if (!(eflags & EXEC_FLAG_SUBPLAN)) + ExecFinishInitRemoteSubplan(remotestate); + } + remotestate->bound = false; + /* + * It does not makes sense to merge sort if there is only one tuple source. 
+ * By the contract it is already sorted + */ + if (node->sort && remotestate->execOnAll && + list_length(remotestate->execNodes) > 1) + combiner->merge_sort = true; + + return remotestate; +} + + +void +ExecFinishInitRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + Oid *paramtypes = NULL; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot; + TimestampTz timestamp; + int i; + bool is_read_only; + char cursor[NAMEDATALEN]; + + /* + * Name is required to store plan as a statement + */ + Assert(plan->cursor); + + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + + /* If it is alreaty fully initialized nothing to do */ + if (combiner->connections) + return; + + /* local only or explain only execution */ + if (node->subplanstr == NULL) + return; + + /* + * Acquire connections and send down subplan where it will be stored + * as a prepared statement. + * That does not require transaction id or snapshot, so does not send them + * here, postpone till bind. 
+ */ + if (node->execOnAll) + { + PGXCNodeAllHandles *pgxc_connections; + pgxc_connections = get_handles(node->execNodes, NIL, false); + combiner->conn_count = pgxc_connections->dn_conn_count; + combiner->connections = pgxc_connections->datanode_handles; + combiner->current_conn = 0; + pfree(pgxc_connections); + } + else + { + combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *)); + combiner->connections[0] = get_any_handle(node->execNodes); + combiner->conn_count = 1; + combiner->current_conn = 0; + } + + gxid = GetCurrentTransactionId(); + if (!GlobalTransactionIdIsValid(gxid)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + + /* extract parameter data types */ + if (node->nParamRemote > 0) + { + paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid)); + for (i = 0; i < node->nParamRemote; i++) + paramtypes[i] = node->remoteparams[i].paramtype; + } + /* send down subplan */ + snapshot = GetActiveSnapshot(); + timestamp = GetCurrentGTMStartTimestamp(); + /* + * Datanode should not send down statements that may modify + * the database. Potgres assumes that all sessions under the same + * postmaster have different xids. That may cause a locking problem. + * Shared locks acquired for reading still work fine. 
+ */ + is_read_only = IS_PGXC_DATANODE || + !IsA(outerPlan(plan), ModifyTable); + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *connection = combiner->connections[i]; + + if (pgxc_node_begin(1, &connection, gxid, true, + is_read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node."))); + + if (pgxc_node_send_timestamp(connection, timestamp)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + pgxc_node_send_plan(connection, cursor, "Remote Subplan", + node->subplanstr, node->nParamRemote, paramtypes); + if (pgxc_node_flush(connection)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send subplan to data nodes"))); + } + } +} + + +static void +append_param_data(StringInfo buf, Oid ptype, Datum value, bool isnull) +{ + uint32 n32; + + if (isnull) + { + n32 = htonl(-1); + appendBinaryStringInfo(buf, (char *) &n32, 4); + } + else + { + Oid typOutput; + bool typIsVarlena; + Datum pval; + char *pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); + + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the type's output routine. 
+ */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(value)); + else + pval = value; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(buf, (char *) &n32, 4); + appendBinaryStringInfo(buf, pstring, len); + } +} + + +static int encode_parameters(int nparams, RemoteParam *remoteparams, + PlanState *planstate, char** result) +{ + EState *estate = planstate->state; + StringInfoData buf; + uint16 n16; + int i; + ExprContext *econtext; + MemoryContext oldcontext; + + if (planstate->ps_ExprContext == NULL) + ExecAssignExprContext(estate, planstate); + + econtext = planstate->ps_ExprContext; + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + initStringInfo(&buf); + + /* Number of parameter values */ + n16 = htons(nparams); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + /* Parameter values */ + for (i = 0; i < nparams; i++) + { + RemoteParam *rparam = &remoteparams[i]; + int ptype = rparam->paramtype; + if (rparam->paramkind == PARAM_EXTERN) + { + ParamExternData *param; + param = &(estate->es_param_list_info->params[rparam->paramid - 1]); + append_param_data(&buf, ptype, param->value, param->isnull); + } + else + { + ParamExecData *param; + param = &(estate->es_param_exec_vals[rparam->paramid]); + if (param->execPlan) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan((SubPlanState *) param->execPlan, + planstate->ps_ExprContext); + /* ExecSetParamPlan should have processed this param... 
*/ + Assert(param->execPlan == NULL); + } + append_param_data(&buf, ptype, param->value, param->isnull); + } + } + + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + MemoryContextSwitchTo(oldcontext); + return buf.len; +} + + +TupleTableSlot * +ExecRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *) node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + EState *estate = combiner->ss.ps.state; + TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot; + +primary_mode_phase_two: + if (!node->bound) + { + int fetch = 0; + int paramlen = 0; + char *paramdata = NULL; + /* + * Conditions when we want to execute query on the primary node first: + * Coordinator running replicated ModifyTable on multiple nodes + */ + bool primary_mode = combiner->probing_primary || + (IS_PGXC_COORDINATOR && + combiner->combine_type == COMBINE_TYPE_SAME && + OidIsValid(primary_data_node) && + combiner->conn_count > 1); + char cursor[NAMEDATALEN]; + + if (plan->cursor) + { + fetch = 1000; + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + } + else + cursor[0] = '\0'; + + /* + * Send down all available parameters, if any is used by the plan + */ + if (estate->es_param_list_info || + !bms_is_empty(plan->scan.plan.allParam)) + paramlen = encode_parameters(node->nParamRemote, + node->remoteparams, + &combiner->ss.ps, + ¶mdata); + + /* + * The subplan being rescanned, need to restore connections and + * re-bind the portal + */ + if (combiner->cursor) + { + int i; + + /* + * On second phase of primary mode connections are properly set, + * so do not copy. 
+ */ + if (!combiner->probing_primary) + { + combiner->conn_count = combiner->cursor_count; + memcpy(combiner->connections, combiner->cursor_connections, + combiner->cursor_count * sizeof(PGXCNodeHandle *)); + } + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + /* close previous cursor only on phase 1 */ + if (!primary_mode || !combiner->probing_primary) + pgxc_node_send_close(conn, false, combiner->cursor); + + /* + * If we now should probe primary, skip execution on non-primary + * nodes + */ + if (primary_mode && !combiner->probing_primary && + conn->nodeoid != primary_data_node) + continue; + + /* rebind */ + pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, + paramlen, paramdata); + /* execute */ + pgxc_node_send_execute(conn, combiner->cursor, fetch); + /* submit */ + if (pgxc_node_send_flush(conn)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + /* + * There could be only one primary node, but can not leave the + * loop now, because we need to close cursors. + */ + if (primary_mode && !combiner->probing_primary) + { + combiner->current_conn = i; + } + } + } + else if (node->execNodes) + { + CommandId cid; + int i; + + /* + * There are prepared statement, connections should be already here + */ + Assert(combiner->conn_count > 0); + + combiner->extended_query = true; + cid = estate->es_snapshot->curcid; + + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + /* + * If we now should probe primary, skip execution on non-primary + * nodes + */ + if (primary_mode && !combiner->probing_primary && + conn->nodeoid != primary_data_node) + continue; + + /* + * Update Command Id. Other command may be executed after we + * prepare and advanced Command Id. 
We should use one that + * was active at the moment when command started. + */ + if (pgxc_node_send_cmd_id(conn, cid)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + /* bind */ + pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata); + /* execute */ + pgxc_node_send_execute(conn, cursor, fetch); + /* submit */ + if (pgxc_node_send_flush(conn)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + + /* + * There could be only one primary node, so if we executed + * subquery on the phase one of primary mode we can leave the + * loop now. + */ + if (primary_mode && !combiner->probing_primary) + { + combiner->current_conn = i; + break; + } + } + + /* + * On second phase of primary mode connections are backed up + * already, so do not copy. + */ + if (primary_mode) + { + if (combiner->probing_primary) + { + combiner->cursor = pstrdup(cursor); + } + else + { + combiner->cursor_count = combiner->conn_count; + combiner->cursor_connections = (PGXCNodeHandle **) palloc( + combiner->conn_count * sizeof(PGXCNodeHandle *)); + memcpy(combiner->cursor_connections, combiner->connections, + combiner->conn_count * sizeof(PGXCNodeHandle *)); + } + } + else + { + combiner->cursor = pstrdup(cursor); + combiner->cursor_count = combiner->conn_count; + combiner->cursor_connections = (PGXCNodeHandle **) palloc( + combiner->conn_count * sizeof(PGXCNodeHandle *)); + memcpy(combiner->cursor_connections, combiner->connections, + combiner->conn_count * sizeof(PGXCNodeHandle *)); + } + } + + if (combiner->merge_sort) + { + /* + * Requests are already made and sorter can fetch tuples to populate + * sort buffer. 
+ */ + combiner->tuplesortstate = tuplesort_begin_merge( + resultslot->tts_tupleDescriptor, + plan->sort->numCols, + plan->sort->sortColIdx, + plan->sort->sortOperators, + plan->sort->sortCollations, + plan->sort->nullsFirst, + combiner, + work_mem); + } + if (primary_mode) + { + if (combiner->probing_primary) + { + combiner->probing_primary = false; + node->bound = true; + } + else + combiner->probing_primary = true; + } + else + node->bound = true; + } + + if (combiner->tuplesortstate) + { + if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate, + true, resultslot)) + return resultslot; + } + else + { + TupleTableSlot *slot = FetchTuple(combiner); + if (!TupIsNull(slot)) + return slot; + else if (combiner->probing_primary) + /* phase1 is successfully completed, run on other nodes */ + goto primary_mode_phase_two; + } + if (combiner->errorMessage) + { + char *code = combiner->errorCode; + if (combiner->errorDetail) + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) )); + else + ereport(ERROR, + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", combiner->errorMessage))); + } + return NULL; +} + + +void +ExecReScanRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *)node; + + /* + * If we haven't queried remote nodes yet, just return. If outerplan' + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else - no reason to re-scan it at all. 
+ */ + if (!node->bound) + return; + + /* + * If we execute locally rescan local copy of the plan + */ + if (outerPlanState(node)) + ExecReScan(outerPlanState(node)); + + /* + * Consume any possible pending input + */ + pgxc_connections_cleanup(combiner); + + /* misc cleanup */ + combiner->command_complete_count = 0; + combiner->description_count = 0; + + /* + * Force query is re-bound with new parameters + */ + node->bound = false; +} + + +void +ExecEndRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = (ResponseCombiner *)node; + RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; + int i; + + if (outerPlanState(node)) + ExecEndNode(outerPlanState(node)); + if (node->locator) + freeLocator(node->locator); + + /* + * Consume any possible pending input + */ + if (node->bound) + pgxc_connections_cleanup(combiner); + + /* + * Update coordinator statistics + */ + if (IS_PGXC_COORDINATOR) + { + EState *estate = combiner->ss.ps.state; + + if (estate->es_num_result_relations > 0 && estate->es_processed > 0) + { + switch (estate->es_plannedstmt->commandType) + { + case CMD_INSERT: + /* One statement can insert into only one relation */ + pgstat_count_remote_insert( + estate->es_result_relations[0].ri_RelationDesc, + estate->es_processed); + break; + case CMD_UPDATE: + case CMD_DELETE: + { + /* + * We can not determine here how many row were updated + * or delete in each table, so assume same number of + * affected row in each table. + * If resulting number of rows is 0 because of rounding, + * increment each counter at least on 1. 
+ */ + int i; + int n; + bool update; + + update = (estate->es_plannedstmt->commandType == CMD_UPDATE); + n = estate->es_processed / estate->es_num_result_relations; + if (n == 0) + n = 1; + for (i = 0; i < estate->es_num_result_relations; i++) + { + Relation r; + r = estate->es_result_relations[i].ri_RelationDesc; + if (update) + pgstat_count_remote_update(r, n); + else + pgstat_count_remote_delete(r, n); + } + } + break; + default: + /* nothing to count */ + break; + } + } + } + + /* + * Close portals. While cursors_connections exist there are open portals + */ + if (combiner->cursor) + { + /* Restore connections where there are active statements */ + combiner->conn_count = combiner->cursor_count; + memcpy(combiner->connections, combiner->cursor_connections, + combiner->cursor_count * sizeof(PGXCNodeHandle *)); + for (i = 0; i < combiner->cursor_count; i++) + { + PGXCNodeHandle *conn; + + conn = combiner->cursor_connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + if (pgxc_node_send_close(conn, false, combiner->cursor) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close data node cursor"))); + } + /* The cursor stuff is not needed */ + combiner->cursor = NULL; + combiner->cursor_count = 0; + pfree(combiner->cursor_connections); + combiner->cursor_connections = NULL; + } + + /* Close statements, even if they never were bound */ + for (i = 0; i < combiner->conn_count; i++) + { + PGXCNodeHandle *conn; + char cursor[NAMEDATALEN]; + + if (plan->cursor) + { + if (plan->unique) + snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + else + strncpy(cursor, plan->cursor, NAMEDATALEN); + } + else + cursor[0] = '\0'; + + conn = combiner->connections[i]; + + CHECK_OWNERSHIP(conn, combiner); + + if (pgxc_node_send_close(conn, true, cursor) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close data node statement"))); + /* Send SYNC and wait for ReadyForQuery */ + if (pgxc_node_send_sync(conn) 
!= 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to synchronize data node"))); + /* + * Formally connection is not in QUERY state, we set the state to read + * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery + * state will be changed back to IDLE and conn->coordinator will be + * cleared. + */ + conn->state = DN_CONNECTION_STATE_CLOSE; + } + + while (combiner->conn_count > 0) + { + if (pgxc_node_receive(combiner->conn_count, + combiner->connections, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to close remote subplan"))); + i = 0; + while (i < combiner->conn_count) + { + int res = handle_response(combiner->connections[i], combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_READY) + { + /* Done, connection is reade for query */ + if (--combiner->conn_count > i) + combiner->connections[i] = + combiner->connections[combiner->conn_count]; + } + else if (res == RESPONSE_DATAROW) + { + /* + * If we are finishing slowly running remote subplan while it + * is still working (because of Limit, for example) it may + * produce one or more tuples between connection cleanup and + * handling Close command. One tuple does not cause any problem, + * but if it will not be read the next tuple will trigger + * assertion failure. So if we got a tuple, just read and + * discard it here. + */ + pfree(combiner->currentRow); + combiner->currentRow = NULL; + } + /* Ignore other possible responses */ + } + } + + ValidateAndCloseCombiner(combiner); + pfree(node); +} +#endif + + /* * pgxc_node_report_error * Throw error from Datanode if any. 
*/ +#ifdef XCP +static void +pgxc_node_report_error(ResponseCombiner *combiner) +#else static void pgxc_node_report_error(RemoteQueryState *combiner) +#endif { /* If no combiner, nothing to do */ if (!combiner) @@ -4885,231 +9152,3 @@ void AtEOXact_DBCleanup(bool isCommit) dbcleanup_info.fparams = NULL; } } - -static TupleTableSlot * -getrow_for_tapesort(RemoteQueryState *combiner, TupleTableSlot *scanslot) -{ - int tapenum = combiner->rqs_tapenum; - PGXCNodeHandle *conn = combiner->connections[tapenum]; - /* - * If connection is active (potentially has data to read) we can get node - * number from the connection. If connection is not active (we have read all - * available data rows) and if we have buffered data from that connection - * the node number is stored in combiner->tapenodes[tapenum]. - * If connection is inactive and no buffered data we have EOF condition - */ - int nid; - ListCell *lc; - ListCell *prev = NULL; - - /* May it ever happen ?! */ - if (!conn && !combiner->tapenodes) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to fetch from data node cursor"))); - - nid = conn ? PGXCNodeGetNodeId(conn->nodeoid, PGXC_NODE_DATANODE) : combiner->tapenodes[tapenum]; - - if (nid < 0) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Node id %d is incorrect", nid))); - - /* - * If there are buffered rows iterate over them and get first from - * the requested tape - */ - foreach (lc, combiner->rowBuffer) - { - RemoteDataRow dataRow = (RemoteDataRow) lfirst(lc); - if (dataRow->msgnode == nid) - { - combiner->currentRow = *dataRow; - combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, lc, prev); - CopyDataRowTupleToSlot(combiner, scanslot); - return scanslot; - } - prev = lc; - } - - /* Nothing is found in the buffer, check for EOF */ - if (conn == NULL) - { - ExecClearTuple(scanslot); - return scanslot; - } - - /* The connection is executing a query but not for this RemoteQueryState. 
- * Before sending the query, it must have buffered the rows for the query of - * this RemoteQueryState, which we have consumed already. So nothing do - * here. Just return a NULL tuple and mark the connection as done - */ - if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != combiner) - { - combiner->connections[tapenum] = NULL; - ExecClearTuple(scanslot); - return scanslot; - } - - /* Read data from the connection until get a row or EOF */ - for (;;) - { - switch (handle_response(conn, combiner)) - { - case RESPONSE_SUSPENDED: - /* Send Execute to request next row */ - Assert(combiner->cursor); - if (pgxc_node_send_execute(conn, combiner->cursor, 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to fetch from data node cursor"))); - if (pgxc_node_send_sync(conn) != 0) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to fetch from data node cursor"))); - conn->state = DN_CONNECTION_STATE_QUERY; - conn->combiner = combiner; - /* fallthru */ - case RESPONSE_EOF: - /* receive more data */ - if (pgxc_node_receive(1, &conn, NULL)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("%s", conn->error))); - break; - - case RESPONSE_COMPLETE: - combiner->connections[tapenum] = NULL; - ExecClearTuple(scanslot); - return scanslot; - break; - - case RESPONSE_DATAROW: - CopyDataRowTupleToSlot(combiner, scanslot); - return scanslot; - break; - - default: - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from the data nodes"))); - } - } - - /* - * Didn't get any row and also didn't get a RESPONSE_COMPLETE (otherwise we - * would have returned from there with this tape nullified). This should - * never happen. Throw an error. - */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Did not get response complete message for the connection."))); -} - - -/* -------------------------------- - * SetDataRowForIntParams: Form a BIND data row for internal parameters. 
- * This function is called when the data for the parameters of remote - * statement resides in some plan slot of an internally generated remote - * statement rather than from some extern params supplied by the caller of the - * query. Currently DML is the only case where we generate a query with - * internal parameters. - * The parameter data is constructed from the slot data, and stored in - * RemoteQueryState.paramval_data. - * At the same time, remote parameter types are inferred from the slot - * tuple descriptor, and stored in RemoteQueryState.rqs_param_types. - * On subsequent calls, these param types are re-used. - * The slot itself is undisturbed. - * -------------------------------- - */ -static void -SetDataRowForIntParams(TupleTableSlot *slot, RemoteQueryState *rq_state) -{ - TupleDesc tdesc = slot->tts_tupleDescriptor; - int att_index; - - Assert(tdesc != NULL); - - /* - * Infer param types from the tuple desc. But we have to do it only the - * first time: the interal parameters remain the same while processing all - * the source data rows because the data slot tupdesc never changes. - * Even though we can determine the internal param types during planning, we - * want to do it here: we don't want to set the param types and param data - * at two different places. Doing them together here helps us to make sure - * that the order of param types are in line with the order of the param - * data. 
- */ - if (rq_state->rqs_num_params == 0) - { - rq_state->rqs_num_params = tdesc->natts; - rq_state->rqs_param_types = - (Oid *) palloc(sizeof(Oid) * rq_state->rqs_num_params); - for (att_index = 0; att_index < rq_state->rqs_num_params; att_index++) - rq_state->rqs_param_types[att_index] = tdesc->attrs[att_index]->atttypid; - } - - /* if we already have datarow make a copy */ - if (slot->tts_dataRow) - { - rq_state->paramval_data = (char *)palloc(slot->tts_dataLen); - memcpy(rq_state->paramval_data, slot->tts_dataRow, slot->tts_dataLen); - rq_state->paramval_len = slot->tts_dataLen; - } - else - { - StringInfoData buf; - uint16 n16; - - initStringInfo(&buf); - /* Number of parameter values */ - n16 = htons(tdesc->natts); - appendBinaryStringInfo(&buf, (char *) &n16, 2); - - /* ensure we have all values */ - slot_getallattrs(slot); - for (att_index = 0; att_index < tdesc->natts; att_index++) - { - uint32 n32; - - if (slot->tts_isnull[att_index]) - { - n32 = htonl(-1); - appendBinaryStringInfo(&buf, (char *) &n32, 4); - } - else - { - Form_pg_attribute attr = tdesc->attrs[att_index]; - Oid typOutput; - bool typIsVarlena; - Datum pval; - char *pstring; - int len; - - /* Get info needed to output the value */ - getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena); - /* - * If we have a toasted datum, forcibly detoast it here to avoid - * memory leakage inside the type's output routine. 
- */ - if (typIsVarlena) - pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[att_index])); - else - pval = slot->tts_values[att_index]; - - /* Convert Datum to string */ - pstring = OidOutputFunctionCall(typOutput, pval); - - /* copy data to the buffer */ - len = strlen(pstring); - n32 = htonl(len); - appendBinaryStringInfo(&buf, (char *) &n32, 4); - appendBinaryStringInfo(&buf, pstring, len); - } - } - - /* Assign the newly allocated data row to paramval */ - rq_state->paramval_data = buf.data; - rq_state->paramval_len = buf.len; - } -} diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 2dd460efd6..0431a2257b 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -6,6 +6,11 @@ * Datanodes and Coordinators * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -50,6 +55,12 @@ #include "utils/lsyscache.h" #include "utils/formatting.h" #include "../interfaces/libpq/libpq-fe.h" +#ifdef XCP +#include "miscadmin.h" +#include "storage/ipc.h" +#include "pgxc/pause.h" +#include "utils/snapmgr.h" +#endif #define CMD_ID_MSG_LEN 8 @@ -75,6 +86,30 @@ static PGXCNodeHandle *co_handles = NULL; int NumDataNodes; int NumCoords; + +#ifdef XCP +volatile bool HandlesInvalidatePending = false; + +/* + * Session and transaction parameters need to to be set on newly connected + * remote nodes. 
+ */ +static HTAB *session_param_htab = NULL; +static HTAB *local_param_htab = NULL; +static StringInfo session_params; +static StringInfo local_params; + +typedef struct +{ + NameData name; + NameData value; +} ParamEntry; + + +static bool DoInvalidateRemoteHandles(void); +#endif + + static void pgxc_node_init(PGXCNodeHandle *handle, int sock); static void pgxc_node_free(PGXCNodeHandle *handle); static void pgxc_node_all_free(void); @@ -100,6 +135,7 @@ init_pgxc_handle(PGXCNodeHandle *pgxc_handle) pgxc_handle->outSize = 16 * 1024; pgxc_handle->outBuffer = (char *) palloc(pgxc_handle->outSize); pgxc_handle->inSize = 16 * 1024; + pgxc_handle->inBuffer = (char *) palloc(pgxc_handle->inSize); pgxc_handle->combiner = NULL; pgxc_handle->inStart = 0; @@ -124,6 +160,10 @@ InitMultinodeExecutor(bool is_force) { int count; Oid *coOids, *dnOids; +#ifdef XCP + MemoryContext oldcontext; +#endif + /* Free all the existing information first */ if (is_force) @@ -140,6 +180,14 @@ InitMultinodeExecutor(bool is_force) /* Get classified list of node Oids */ PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true); +#ifdef XCP + /* + * Coordinator and datanode handles should be available during all the + * session lifetime + */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); +#endif + /* Do proper initialization of handles */ if (NumDataNodes > 0) dn_handles = (PGXCNodeHandle *) @@ -170,6 +218,28 @@ InitMultinodeExecutor(bool is_force) coord_count = 0; PGXCNodeId = 0; +#ifdef XCP + MemoryContextSwitchTo(oldcontext); + + if (IS_PGXC_COORDINATOR) + { + for (count = 0; count < NumCoords; count++) + { + if (pg_strcasecmp(PGXCNodeName, + get_pgxc_nodename(co_handles[count].nodeoid)) == 0) + PGXCNodeId = count + 1; + } + } + else /* DataNode */ + { + for (count = 0; count < NumDataNodes; count++) + { + if (pg_strcasecmp(PGXCNodeName, + get_pgxc_nodename(dn_handles[count].nodeoid)) == 0) + PGXCNodeId = count + 1; + } + } +#else /* Finally determine which is the node-self */ 
for (count = 0; count < NumCoords; count++) { @@ -186,6 +256,7 @@ InitMultinodeExecutor(bool is_force) ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("Coordinator cannot identify itself"))); +#endif } @@ -193,8 +264,13 @@ InitMultinodeExecutor(bool is_force) * Builds up a connection string */ char * +#ifdef XCP +PGXCNodeConnStr(char *host, int port, char *dbname, + char *user, char *remote_type, char *parent_node) +#else PGXCNodeConnStr(char *host, int port, char *dbname, char *user, char *pgoptions, char *remote_type) +#endif { char *out, connstr[256]; @@ -204,9 +280,15 @@ PGXCNodeConnStr(char *host, int port, char *dbname, * Build up connection string * remote type can be Coordinator, Datanode or application. */ +#ifdef XCP + num = snprintf(connstr, sizeof(connstr), + "host=%s port=%d dbname=%s user=%s application_name=pgxc sslmode=disable options='-c remotetype=%s -c parentnode=%s'", + host, port, dbname, user, remote_type, parent_node); +#else num = snprintf(connstr, sizeof(connstr), "host=%s port=%d dbname=%s user=%s application_name=pgxc options='-c remotetype=%s %s'", host, port, dbname, user, remote_type, pgoptions); +#endif /* Check for overflow */ if (num > 0 && num < sizeof(connstr)) @@ -246,6 +328,8 @@ PGXCNodeClose(NODE_CONNECTION *conn) PQfinish((PGconn *) conn); } + +#ifndef XCP /* * Send SET query to given connection. 
* Query is sent asynchronously and results are consumed @@ -267,6 +351,7 @@ PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command) return 0; } +#endif /* @@ -338,6 +423,9 @@ pgxc_node_all_free(void) co_handles = NULL; dn_handles = NULL; +#ifdef XCP + HandlesInvalidatePending = false; +#endif } /* @@ -348,9 +436,17 @@ pgxc_node_all_free(void) static void pgxc_node_init(PGXCNodeHandle *handle, int sock) { +#ifdef XCP + char *init_str; +#endif + handle->sock = sock; handle->transaction_status = 'I'; handle->state = DN_CONNECTION_STATE_IDLE; +#ifdef XCP + handle->read_only = true; + handle->ck_resp_rollback = false; +#endif handle->combiner = NULL; #ifdef DN_CONNECTION_DEBUG handle->have_row_desc = false; @@ -360,6 +456,17 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock) handle->inStart = 0; handle->inEnd = 0; handle->inCursor = 0; +#ifdef XCP + /* + * We got a new connection, set on the remote node the session parameters + * if defined. The transaction parameter should be sent after BEGIN + */ + init_str = PGXCNodeGetSessionParamStr(); + if (init_str) + { + pgxc_node_set_query(handle, init_str); + } +#endif } @@ -422,6 +529,9 @@ pgxc_node_receive(const int conn_count, } retry: +#ifdef XCP + CHECK_FOR_INTERRUPTS(); +#endif res_select = select(nfds + 1, &readfds, NULL, NULL, timeout); if (res_select < 0) { @@ -442,8 +552,12 @@ retry: if (res_select == 0) { /* Handle timeout */ - elog(WARNING, "timeout while waiting for response"); - return ERROR_OCCURED; + elog(DEBUG1, "timeout while waiting for response"); +#ifdef XCP + for (i = 0; i < conn_count; i++) + connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL; +#endif + return NO_ERROR_OCCURED; } /* read data */ @@ -553,8 +667,11 @@ retry: if (nread < 0) { +#ifndef XCP + /* too noisy */ if (close_if_error) elog(DEBUG1, "dnrd errno = %d", errno); +#endif if (errno == EINTR) goto retry; /* Some systems return EAGAIN/EWOULDBLOCK for no data */ @@ -739,8 +856,25 @@ get_message(PGXCNodeHandle *conn, int 
*len, char **msg) void release_handles(void) { +#ifdef XCP + bool destroy = false; +#endif int i; +#ifdef XCP + if (HandlesInvalidatePending) + { + DoInvalidateRemoteHandles(); + return; + } + + /* don't free connection if holding a cluster lock */ + if (cluster_ex_lock_held) + { + return; + } +#endif + if (datanode_count == 0 && coord_count == 0) return; @@ -755,13 +889,32 @@ release_handles(void) if (handle->sock != NO_SOCKET) { +#ifdef XCP + /* + * Connections at this point should be completely inactive, + * otherwise abaandon them. We can not allow not cleaned up + * connection is returned to pool. + */ + if (handle->state != DN_CONNECTION_STATE_IDLE || + handle->transaction_status != 'I') + { + destroy = true; + elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", + handle->nodeoid, handle->state); + } +#else if (handle->state != DN_CONNECTION_STATE_IDLE) elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", handle->nodeoid, handle->state); +#endif pgxc_node_free(handle); } } +#ifdef XCP + if (IS_PGXC_COORDINATOR) + { +#endif /* Collect Coordinator handles */ for (i = 0; i < NumCoords; i++) { @@ -769,20 +922,43 @@ release_handles(void) if (handle->sock != NO_SOCKET) { +#ifdef XCP + /* + * Connections at this point should be completely inactive, + * otherwise abaandon them. We can not allow not cleaned up + * connection is returned to pool. 
+ */ + if (handle->state != DN_CONNECTION_STATE_IDLE || + handle->transaction_status != 'I') + { + destroy = true; + elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped", + handle->nodeoid, handle->state); + } +#else if (handle->state != DN_CONNECTION_STATE_IDLE) elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped", handle->nodeoid, handle->state); +#endif pgxc_node_free(handle); } } +#ifdef XCP + } +#endif /* And finally release all the connections on pooler */ +#ifdef XCP + PoolManagerReleaseConnections(destroy); +#else PoolManagerReleaseConnections(); +#endif datanode_count = 0; coord_count = 0; } +#ifndef XCP /* * cancel a running query due to error while processing rows */ @@ -790,7 +966,7 @@ void cancel_query(void) { int i; - int dn_cancel[NumDataNodes]; + int dn_cancel[NumDataNodes]; int co_cancel[NumCoords]; int dn_count = 0; int co_count = 0; @@ -912,6 +1088,7 @@ clear_all_data(void) handle->error = NULL; } } +#endif /* * Ensure specified amount of data can fit to the incoming buffer and @@ -1202,6 +1379,87 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, } +#ifdef XCP +/* + * Send PLAN message down to the Data node + */ +int +pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, + const char *query, const char *planstr, + short num_params, Oid *param_types) +{ + int stmtLen; + int queryLen; + int planLen; + int paramTypeLen; + int msgLen; + char **paramTypes = (char **)palloc(sizeof(char *) * num_params); + int i; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + return EOF; + + /* statement name size (do not allow NULL) */ + stmtLen = strlen(statement) + 1; + /* source query size (do not allow NULL) */ + queryLen = strlen(query) + 1; + /* query plan size (do not allow NULL) */ + planLen = strlen(planstr) + 1; + /* 2 bytes for number of parameters, preceding the type names */ + paramTypeLen = 2; + /* 
find names of the types of parameters */ + for (i = 0; i < num_params; i++) + { + paramTypes[i] = format_type_be(param_types[i]); + paramTypeLen += strlen(paramTypes[i]) + 1; + } + /* size + pnameLen + queryLen + parameters */ + msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'p'; + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + /* statement name */ + memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen); + handle->outEnd += stmtLen; + /* source query */ + memcpy(handle->outBuffer + handle->outEnd, query, queryLen); + handle->outEnd += queryLen; + /* query plan */ + memcpy(handle->outBuffer + handle->outEnd, planstr, planLen); + handle->outEnd += planLen; + /* parameter types */ + *((short *)(handle->outBuffer + handle->outEnd)) = htons(num_params); + handle->outEnd += sizeof(num_params); + /* + * instead of parameter ids we should send parameter names (qualified by + * schema name if required). The OIDs of types can be different on + * datanodes. 
+ */ + for (i = 0; i < num_params; i++) + { + int plen = strlen(paramTypes[i]) + 1; + memcpy(handle->outBuffer + handle->outEnd, paramTypes[i], plen); + handle->outEnd += plen; + pfree(paramTypes[i]); + } + pfree(paramTypes); + + return 0; +} +#endif + + /* * Send BIND message down to the Datanode */ @@ -1366,8 +1624,6 @@ pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, else handle->outBuffer[handle->outEnd++] = '\0'; - handle->state = DN_CONNECTION_STATE_QUERY; - return 0; } @@ -1468,7 +1724,7 @@ pgxc_node_send_sync(PGXCNodeHandle * handle) /* - * Send the GXID down to the Datanode + * Send series of Extended Query protocol messages to the data node */ int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, @@ -1489,12 +1745,18 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (fetch_size >= 0) if (pgxc_node_send_execute(handle, portal, fetch_size)) return EOF; +#ifdef XCP + if (pgxc_node_send_flush(handle)) + return EOF; +#else if (pgxc_node_send_sync(handle)) return EOF; +#endif return 0; } + /* * This method won't return until connection buffer is empty or error occurs * To ensure all data are on the wire before waiting for response @@ -1526,6 +1788,13 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) if (handle == NULL) return; +#ifdef XCP + /* + * Before reading input send Sync to make sure + * we will eventually receive ReadyForQuery + */ + pgxc_node_send_sync(handle); +#endif while(true) { read_result = pgxc_node_read_data(handle, false); @@ -1752,6 +2021,9 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) void add_error_message(PGXCNodeHandle *handle, const char *message) { +#ifdef XCP + elog(LOG, "Connection error %s", message); +#endif handle->transaction_status = 'E'; if (handle->error) { @@ -1761,6 +2033,102 @@ add_error_message(PGXCNodeHandle *handle, const char *message) handle->error = pstrdup(message); } + +#ifdef XCP +static int load_balancer = 0; +/* + * Get 
one of the specified nodes to query replicated data source. + * If session already owns one or more of the requested connection, + * the function returns existing one to avoid contacting pooler. + * Performs basic load balancing. + */ +PGXCNodeHandle * +get_any_handle(List *datanodelist) +{ + ListCell *lc1; + int i, node; + + /* sanity check */ + Assert(list_length(datanodelist) > 0); + + if (HandlesInvalidatePending) + if (DoInvalidateRemoteHandles()) + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); + + /* loop through local datanode handles */ + for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++) + { + /* At the moment node is an index in the array, and we may need to wrap it */ + if (node >= NumDataNodes) + node -= NumDataNodes; + /* See if handle is already used */ + if (dn_handles[node].sock != NO_SOCKET) + { + foreach(lc1, datanodelist) + { + if (lfirst_int(lc1) == node) + { + /* + * The node is in the list of requested nodes, + * set load_balancer for next time and return the handle + */ + load_balancer = node + 1; + return &dn_handles[node]; + } + } + } + } + + /* + * None of requested nodes is in use, need to get one from the pool. + * Choose one. 
+ */ + for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++) + { + /* At the moment node is an index in the array, and we may need to wrap it */ + if (node >= NumDataNodes) + node -= NumDataNodes; + /* Look only at empty slots, we have already checked existing handles */ + if (dn_handles[node].sock == NO_SOCKET) + { + foreach(lc1, datanodelist) + { + if (lfirst_int(lc1) == node) + { + /* The node is requested */ + List *allocate = list_make1_int(node); + int *fds = PoolManagerGetConnections(allocate, NIL); + + if (!fds) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("Failed to get pooled connections"))); + } + + pgxc_node_init(&dn_handles[node], fds[0]); + datanode_count++; + + /* + * set load_balancer for next time and return the handle + */ + load_balancer = node + 1; + return &dn_handles[node]; + } + } + } + } + + /* We should not get here, one of the cases should be met */ + Assert(false); + /* Keep compiler quiet */ + return NULL; +} +#endif + + /* * for specified list return array of PGXCNodeHandles * acquire from pool if needed. 
@@ -1782,6 +2150,14 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query) /* index of the result array */ int i = 0; +#ifdef XCP + if (HandlesInvalidatePending) + if (DoInvalidateRemoteHandles()) + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); +#endif + result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); if (!result) { @@ -2010,6 +2386,64 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query) return result; } + +#ifdef XCP +PGXCNodeAllHandles * +get_current_handles(void) +{ + PGXCNodeAllHandles *result; + PGXCNodeHandle *node_handle; + int i; + + result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); + if (!result) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + result->primary_handle = NULL; + result->co_conn_count = 0; + result->dn_conn_count = 0; + + result->datanode_handles = (PGXCNodeHandle **) + palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + for (i = 0; i < NumDataNodes; i++) + { + node_handle = &dn_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->datanode_handles[result->dn_conn_count++] = node_handle; + } + + result->coord_handles = (PGXCNodeHandle **) + palloc(NumCoords * sizeof(PGXCNodeHandle *)); + if (!result->coord_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + for (i = 0; i < NumCoords; i++) + { + node_handle = &co_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->coord_handles[result->co_conn_count++] = node_handle; + } + + return result; +} +#endif + + /* Free PGXCNodeAllHandles structure */ void pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) @@ -2027,6 +2461,52 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) 
pfree(pgxc_handles); } +#ifdef XCP +/* + * PGXCNode_getNodeId + * Look at the data cached for handles and return node position + * If node type is PGXC_NODE_COORDINATOR look only in coordinator list, + * if node type is PGXC_NODE_DATANODE look only in datanode list, + * if other (assume PGXC_NODE_NODE) search both, in last case return actual + * node type. + */ +int +PGXCNodeGetNodeId(Oid nodeoid, char *node_type) +{ + int i; + + /* First check datanodes, they referenced more often */ + if (node_type == NULL || *node_type != PGXC_NODE_COORDINATOR) + { + for (i = 0; i < NumDataNodes; i++) + { + if (dn_handles[i].nodeoid == nodeoid) + { + if (node_type) + *node_type = PGXC_NODE_DATANODE; + return i; + } + } + } + /* Then check coordinators */ + if (node_type == NULL || *node_type != PGXC_NODE_DATANODE) + { + for (i = 0; i < NumCoords; i++) + { + if (co_handles[i].nodeoid == nodeoid) + { + if (node_type) + *node_type = PGXC_NODE_COORDINATOR; + return i; + } + } + } + /* Not found, have caller handling it */ + if (node_type) + *node_type = PGXC_NODE_NONE; + return -1; +} +#else /* * PGXCNode_getNodeId * Look at the data cached for handles and return node position @@ -2065,6 +2545,7 @@ PGXCNodeGetNodeId(Oid nodeoid, char node_type) } return res; } +#endif /* * PGXCNode_getNodeOid @@ -2108,20 +2589,354 @@ pgxc_node_str(PG_FUNCTION_ARGS) * Return node position in handles array */ int +#ifdef XCP +PGXCNodeGetNodeIdFromName(char *node_name, char *node_type) +#else PGXCNodeGetNodeIdFromName(char *node_name, char node_type) +#endif { char *nm; Oid nodeoid; if (node_name == NULL) +#ifdef XCP + { + if (node_type) + *node_type = PGXC_NODE_NONE; + return -1; + } +#else return -1; +#endif nm = str_tolower(node_name, strlen(node_name), DEFAULT_COLLATION_OID); nodeoid = get_pgxc_nodeoid(nm); pfree(nm); if (!OidIsValid(nodeoid)) +#ifdef XCP + { + if (node_type) + *node_type = PGXC_NODE_NONE; return -1; + } +#else + return -1; +#endif return PGXCNodeGetNodeId(nodeoid, node_type); } + 
+ +#ifdef XCP +/* + * Remember new value of a session or transaction parameter, and set same + * values on newly connected remote nodes. + */ +void +PGXCNodeSetParam(bool local, const char *name, const char *value) +{ + HTAB *table; + + /* Get the target hash table and invalidate command string */ + if (local) + { + table = local_param_htab; + if (local_params) + resetStringInfo(local_params); + } + else + { + table = session_param_htab; + if (session_params) + resetStringInfo(session_params); + } + + /* Initialize table if empty */ + if (table == NULL) + { + HASHCTL hinfo; + int hflags; + + /* do not bother creating hash table if we about to reset non-existing + * parameter */ + if (value == NULL) + return; + + /* Init parameter hashtable */ + MemSet(&hinfo, 0, sizeof(hinfo)); + hflags = 0; + + hinfo.keysize = NAMEDATALEN; + hinfo.entrysize = sizeof(ParamEntry); + hflags |= HASH_ELEM; + + if (local) + { + /* Local parameters are not valid beyond transaction boundaries */ + hinfo.hcxt = TopTransactionContext; + hflags |= HASH_CONTEXT; + table = hash_create("Remote local params", 16, &hinfo, hflags); + local_param_htab = table; + } + else + { + /* + * Session parameters needs to be in TopMemoryContext, hash table + * is created in TopMemoryContext by default. + */ + table = hash_create("Remote session params", 16, &hinfo, hflags); + session_param_htab = table; + } + } + + if (value) + { + ParamEntry *entry; + /* create entry or replace value for the parameter */ + entry = (ParamEntry *) hash_search(table, name, HASH_ENTER, NULL); + strlcpy((char *) (&entry->value), value, NAMEDATALEN); + } + else + { + /* remove entry */ + hash_search(table, name, HASH_REMOVE, NULL); + /* remove table if it becomes empty */ + if (hash_get_num_entries(table) == 0) + { + hash_destroy(table); + if (local) + local_param_htab = NULL; + else + session_param_htab = NULL; + } + } +} + + +/* + * Forget all parameter values set either for transaction or both transaction + * and session. 
+ */ +void +PGXCNodeResetParams(bool only_local) +{ + if (!only_local && session_param_htab) + { + /* need to explicitly pfree session stuff, it is in TopMemoryContext */ + hash_destroy(session_param_htab); + session_param_htab = NULL; + if (session_params) + { + pfree(session_params->data); + pfree(session_params); + session_params = NULL; + } + } + /* + * no need to explicitly destroy the local_param_htab and local_params, + * it will be gone with the transaction memory context. + */ + local_param_htab = NULL; + local_params = NULL; +} + + +static char * +quote_ident_cstr(char *rawstr) +{ + text *rawstr_text; + text *result_text; + char *result; + + rawstr_text = cstring_to_text(rawstr); + result_text = DatumGetTextP(DirectFunctionCall1(quote_ident, + PointerGetDatum(rawstr_text))); + result = text_to_cstring(result_text); + + return result; +} + +static void +get_set_command(HTAB *table, StringInfo command, bool local) +{ + HASH_SEQ_STATUS hseq_status; + ParamEntry *entry; + + if (table == NULL) + return; + + hash_seq_init(&hseq_status, table); + while ((entry = (ParamEntry *) hash_seq_search(&hseq_status))) + { + char *value = NameStr(entry->value); + + if (strlen(value) == 0) + value = "''"; + + appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "", + NameStr(entry->name), value); + } +} + + +/* + * Returns SET commands needed to initialize remote session. + * The command may already be built and valid; return it right away if that is the case. + * Otherwise build it up. + * To support Distributed Session machinery coordinator should generate and + * send a distributed session identifier to remote nodes. Generate it here. + */ +char * +PGXCNodeGetSessionParamStr(void) +{ + /* + * If no session parameters are set and that is a coordinator we need to set + * global_session anyway, even if there were no other parameters. + * We do not want this string to disappear, so create it in the + * TopMemoryContext. 
However, if we add the first session parameter we will need + * to free the buffer and recreate it in the same context as the hash table + * to avoid memory leakage. + */ + if (session_params == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext); + session_params = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + /* If the paramstr is invalid, build it up */ + if (session_params->len == 0) + { + if (IS_PGXC_COORDINATOR) + appendStringInfo(session_params, "SET global_session TO %s_%d;", + PGXCNodeName, MyProcPid); + get_set_command(session_param_htab, session_params, false); + } + return session_params->len == 0 ? NULL : session_params->data; +} + + +/* + * Returns SET commands needed to initialize transaction on a remote session. + * The command may already be built and valid; return it right away if that is the case. + * Otherwise build it up. + */ +char * +PGXCNodeGetTransactionParamStr(void) +{ + /* If no local parameters are defined there is nothing to return */ + if (local_param_htab == NULL) + return NULL; + + /* + * If the paramstr is invalid, build it up. + */ + if (local_params == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopTransactionContext); + local_params = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + /* + * If parameter string exists it is valid, it is truncated when parameters + * are modified. + */ + if (local_params->len == 0) + { + get_set_command(local_param_htab, local_params, true); + } + return local_params->len == 0 ? NULL : local_params->data; +} + + +/* + * Send down specified query, read and discard all responses until ReadyForQuery + */ +void +pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) +{ + pgxc_node_send_query(handle, set_query); + /* + * Now read responses until ReadyForQuery. + * XXX We may need to handle possible errors here. 
+ */ + for (;;) + { + char msgtype; + int msglen; + char *msg; + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* don't read from the connection if there is a fatal error */ + if (handle->state == DN_CONNECTION_STATE_ERROR_FATAL) + break; + + /* No data available, read more */ + if (!HAS_MESSAGE_BUFFERED(handle)) + { + pgxc_node_receive(1, &handle, NULL); + continue; + } + msgtype = get_message(handle, &msglen, &msg); + /* + * Ignore any response except ReadyForQuery; it allows us to go on. + */ + if (msgtype == 'Z') /* ReadyForQuery */ + { + handle->transaction_status = msg[0]; + handle->state = DN_CONNECTION_STATE_IDLE; + handle->combiner = NULL; + break; + } + } +} + + +void +RequestInvalidateRemoteHandles(void) +{ + HandlesInvalidatePending = true; +} + + +/* + * For all handles, mark them as not in use and discard pending input/output + */ +static bool +DoInvalidateRemoteHandles(void) +{ + int i; + PGXCNodeHandle *handle; + bool result = false; + + HandlesInvalidatePending = false; + + for (i = 0; i < NumCoords; i++) + { + handle = &co_handles[i]; + if (handle->sock != NO_SOCKET) + result = true; + handle->sock = NO_SOCKET; + handle->inStart = handle->inEnd = handle->inCursor = 0; + handle->outEnd = 0; + } + for (i = 0; i < NumDataNodes; i++) + { + handle = &dn_handles[i]; + if (handle->sock != NO_SOCKET) + result = true; + handle->sock = NO_SOCKET; + handle->inStart = handle->inEnd = handle->inCursor = 0; + handle->outEnd = 0; + } + return result; +} +#endif diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 8f038c6abc..be2c387fad 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -24,6 +24,11 @@ * allocated to a session, at most one per 
Datanode. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -61,9 +66,18 @@ #include <string.h> #include <sys/types.h> #include <sys/socket.h> +#ifdef XCP +#include "pgxc/pause.h" +#include "storage/procarray.h" +#endif /* Configuration options */ +#ifdef XCP +int PoolConnKeepAlive = 600; +int PoolMaintenanceTimeout = 30; +#else int MinPoolSize = 1; +#endif int MaxPoolSize = 100; int PoolerPort = 6667; @@ -80,6 +94,15 @@ typedef struct int port; } PGXCNodeConnectionInfo; +#ifdef XCP +/* Handle to the pool manager (Session's side) */ +typedef struct +{ + /* communication channel */ + PoolPort port; +} PoolHandle; +#endif + /* The root memory context */ static MemoryContext PoolerMemoryContext = NULL; /* @@ -105,11 +128,17 @@ static int is_pool_locked = false; static int server_fd = -1; static int node_info_check(PoolAgent *agent); +#ifdef XCP +static void agent_init(PoolAgent *agent, const char *database, + const char *user_name); +#else static void agent_init(PoolAgent *agent, const char *database, const char *user_name, const char *pgoptions); +#endif static void agent_destroy(PoolAgent *agent); static void agent_create(void); static void agent_handle_input(PoolAgent *agent, StringInfo s); +#ifndef XCP static int agent_session_command(PoolAgent *agent, const char *set_command, PoolCommandType command_type); @@ -117,18 +146,33 @@ static int agent_set_command(PoolAgent *agent, const char *set_command, PoolCommandType command_type); static int agent_temp_command(PoolAgent *agent); +#endif +#ifdef XCP +static DatabasePool *create_database_pool(const char *database, + const char *user_name); 
+#else static DatabasePool *create_database_pool(const char *database, const char *user_name, const char *pgoptions); +#endif static void insert_database_pool(DatabasePool *pool); static int destroy_database_pool(const char *database, const char *user_name); static void reload_database_pools(PoolAgent *agent); +#ifdef XCP +static DatabasePool *find_database_pool(const char *database, + const char *user_name); +#else static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions); +#endif static DatabasePool *remove_database_pool(const char *database, const char *user_name); static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist); +#ifndef XCP static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist); +#endif static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node); static void agent_release_connections(PoolAgent *agent, bool force_destroy); +#ifndef XCP static void agent_reset_session(PoolAgent *agent); +#endif static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, Oid node, bool force_destroy); static void destroy_slot(PGXCNodePoolSlot *slot); @@ -143,14 +187,21 @@ static int *abort_pids(int *count, const char *database, const char *user_name); static char *build_node_conn_str(Oid node, DatabasePool *dbPool); - /* Signal handlers */ static void pooler_die(SIGNAL_ARGS); static void pooler_quickdie(SIGNAL_ARGS); - +#ifdef XCP +static void PoolManagerConnect(const char *database, const char *user_name); +static void pooler_sighup(SIGNAL_ARGS); +static bool shrink_pool(DatabasePool *pool); +static void pools_maintenance(void); +#endif /* * Flags set by interrupt handlers for later service in the main loop. 
*/ +#ifdef XCP +static volatile sig_atomic_t got_SIGHUP = false; +#endif static volatile sig_atomic_t shutdown_requested = false; void @@ -208,7 +259,11 @@ PoolManagerInit() pqsignal(SIGINT, pooler_die); pqsignal(SIGTERM, pooler_die); pqsignal(SIGQUIT, pooler_quickdie); +#ifdef XCP + pqsignal(SIGHUP, pooler_sighup); +#else pqsignal(SIGHUP, SIG_IGN); +#endif /* TODO other signal handlers */ /* We allow SIGQUIT (quickdie) at all times */ @@ -331,17 +386,30 @@ PoolManagerDestroy(void) } +#ifdef XCP +/* + * Connect to the pooler process + */ +static void +#else /* * Get handle to pool manager * Invoked from Postmaster's main loop just before forking off new session * Returned PoolHandle structure will be inherited by session process */ PoolHandle * +#endif GetPoolManagerHandle(void) { PoolHandle *handle; int fdsock; +#ifdef XCP + if (poolHandle) + /* already connected */ + return; +#endif + /* Connect to the pooler */ fdsock = pool_connect(PoolerPort, UnixSocketDir); if (fdsock < 0) @@ -352,7 +420,9 @@ GetPoolManagerHandle(void) (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("failed to connect to pool manager: %m"))); errno = saved_errno; +#ifndef XCP return NULL; +#endif } /* Allocate handle */ @@ -369,7 +439,9 @@ GetPoolManagerHandle(void) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); +#ifndef XCP return NULL; +#endif } handle->port.fdsock = fdsock; @@ -377,12 +449,17 @@ GetPoolManagerHandle(void) handle->port.RecvPointer = 0; handle->port.SendPointer = 0; +#ifdef XCP + poolHandle = handle; +#else return handle; +#endif } +#ifndef XCP /* - * Close handle + * XXX May create on_proc_exit callback instead */ void PoolManagerCloseHandle(PoolHandle *handle) @@ -391,7 +468,7 @@ PoolManagerCloseHandle(PoolHandle *handle) free(handle); handle = NULL; } - +#endif /* * Create agent @@ -444,9 +521,11 @@ agent_create(void) agent->coord_conn_oids = NULL; agent->dn_connections = NULL; agent->coord_connections = NULL; +#ifndef XCP agent->session_params = 
NULL; agent->local_params = NULL; agent->is_temp = false; +#endif agent->pid = 0; /* Append new agent to the list */ @@ -455,6 +534,8 @@ agent_create(void) MemoryContextSwitchTo(oldcontext); } + +#ifndef XCP /* * session_options * Returns the pgoptions string generated using a particular @@ -508,11 +589,86 @@ char *session_options(void) return options.data; } +#endif + /* * Associate session with specified database and respective connection pool * Invoked from Session process */ +#ifdef XCP +static void +PoolManagerConnect(const char *database, const char *user_name) +{ + int n32; + char msgtype = 'c'; + int unamelen = strlen(user_name); + int dbnamelen = strlen(database); + char atchar = ' '; + + /* Connect to the pooler process if not yet connected */ + GetPoolManagerHandle(); + if (poolHandle == NULL) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to connect to the pooler process"))); + + /* + * Special handling for db_user_namespace=on + * We need to handle per-db users and global users. The per-db users will + * arrive with @dbname and global users just as username. 
Handle both of + * them appropriately + */ + if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0) + { + if (strchr(user_name, '@') != NULL) + { + Assert(unamelen > dbnamelen + 1); + unamelen -= (dbnamelen + 1); + } + else + { + atchar = '@'; + unamelen++; + } + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + + /* Message length */ + n32 = htonl(dbnamelen + unamelen + 18); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + + /* PID number */ + n32 = htonl(MyProcPid); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + + /* Length of Database string */ + n32 = htonl(dbnamelen + 1); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + + /* Send database name followed by \0 terminator */ + pool_putbytes(&poolHandle->port, database, dbnamelen); + pool_putbytes(&poolHandle->port, "\0", 1); + + /* Length of user name string */ + n32 = htonl(unamelen + 1); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + + /* Send user name followed by \0 terminator */ + /* Send the '@' char if needed. 
Already accounted for in len */ + if (atchar == '@') + { + pool_putbytes(&poolHandle->port, user_name, unamelen - 1); + pool_putbytes(&poolHandle->port, "@", 1); + } + else + pool_putbytes(&poolHandle->port, user_name, unamelen); + pool_putbytes(&poolHandle->port, "\0", 1); + pool_flush(&poolHandle->port); +} +#else void PoolManagerConnect(PoolHandle *handle, const char *database, const char *user_name, @@ -564,6 +720,7 @@ PoolManagerConnect(PoolHandle *handle, pool_flush(&handle->port); } +#endif /* * Reconnect to pool manager @@ -572,6 +729,13 @@ PoolManagerConnect(PoolHandle *handle, void PoolManagerReconnect(void) { +#ifdef XCP + /* Connected, disconnect */ + if (poolHandle) + PoolManagerDisconnect(); + + PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName()); +#else PoolHandle *handle; Assert(poolHandle); @@ -582,8 +746,11 @@ PoolManagerReconnect(void) get_database_name(MyDatabaseId), GetUserNameFromId(GetUserId()), session_options()); +#endif } + +#ifndef XCP int PoolManagerSetCommand(PoolCommandType command_type, const char *set_command) { @@ -694,6 +861,7 @@ PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_li /* Get result */ return pool_recvres(&poolHandle->port); } +#endif /* * Lock/unlock pool manager @@ -706,7 +874,13 @@ PoolManagerLock(bool is_lock) char msgtype = 'o'; int n32; int msglen = 8; +#ifdef XCP + if (poolHandle == NULL) + PoolManagerConnect(get_database_name(MyDatabaseId), + GetClusterUserName()); +#else Assert(poolHandle); +#endif /* Message type */ pool_putbytes(&poolHandle->port, &msgtype, 1); @@ -724,9 +898,15 @@ PoolManagerLock(bool is_lock) /* * Init PoolAgent */ +#ifdef XCP +static void +agent_init(PoolAgent *agent, const char *database, + const char *user_name) +#else static void agent_init(PoolAgent *agent, const char *database, const char *user_name, const char *pgoptions) +#endif { MemoryContext oldcontext; @@ -748,12 +928,21 @@ agent_init(PoolAgent *agent, const char *database, 
const char *user_name, palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *)); agent->dn_connections = (PGXCNodePoolSlot **) palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *)); +#ifdef XCP + /* find database */ + agent->pool = find_database_pool(database, user_name); + + /* create if not found */ + if (agent->pool == NULL) + agent->pool = create_database_pool(database, user_name); +#else /* find database */ agent->pool = find_database_pool(database, user_name, pgoptions); /* create if not found */ if (agent->pool == NULL) agent->pool = create_database_pool(database, user_name, pgoptions); +#endif MemoryContextSwitchTo(oldcontext); @@ -775,6 +964,13 @@ agent_destroy(PoolAgent *agent) /* Discard connections if any remaining */ if (agent->pool) { +#ifdef XCP + /* + * If session is disconnecting while there are active connections + * we can not know if they clean or not, so force destroy them + */ + agent_release_connections(agent, true); +#else /* * Agent is being destroyed, so reset session parameters * before putting back connections to pool. @@ -786,6 +982,7 @@ agent_destroy(PoolAgent *agent) * Force disconnection if there are temporary objects on agent. 
*/ agent_release_connections(agent, agent->is_temp); +#endif } /* find agent in the list */ @@ -813,12 +1010,20 @@ agent_destroy(PoolAgent *agent) void PoolManagerDisconnect(void) { +#ifdef XCP + if (!poolHandle) + return; /* not even connected */ +#else Assert(poolHandle); +#endif pool_putmessage(&poolHandle->port, 'd', NULL, 0); pool_flush(&poolHandle->port); close(Socket(poolHandle->port)); +#ifdef XCP + free(poolHandle); +#endif poolHandle = NULL; } @@ -835,7 +1040,13 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist) int totlen = list_length(datanodelist) + list_length(coordlist); int nodes[totlen + 2]; +#ifdef XCP + if (poolHandle == NULL) + PoolManagerConnect(get_database_name(MyDatabaseId), + GetClusterUserName()); +#else Assert(poolHandle); +#endif /* * Prepare end send message to pool manager. @@ -895,7 +1106,17 @@ PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids) int dblen = dbname ? strlen(dbname) + 1 : 0; int userlen = username ? strlen(username) + 1 : 0; +#ifdef XCP + /* + * New connection may be established to clean connections to + * specified nodes and databases. + */ + if (poolHandle == NULL) + PoolManagerConnect(get_database_name(MyDatabaseId), + GetClusterUserName()); +#else Assert(poolHandle); +#endif /* Message type */ pool_putbytes(&poolHandle->port, &msgtype, 1); @@ -944,6 +1165,16 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch int userlen = username ? strlen(username) + 1 : 0; int dblen = dbname ? strlen(dbname) + 1 : 0; +#ifdef XCP + /* + * New connection may be established to clean connections to + * specified nodes and databases. 
+ */ + if (poolHandle == NULL) + PoolManagerConnect(get_database_name(MyDatabaseId), + GetClusterUserName()); +#endif + nodes[0] = htonl(list_length(datanodelist)); i = 1; if (list_length(datanodelist) != 0) @@ -1008,7 +1239,17 @@ PoolManagerCheckConnectionInfo(void) { int res; +#ifdef XCP + /* + * New connection may be established to clean connections to + * specified nodes and databases. + */ + if (poolHandle == NULL) + PoolManagerConnect(get_database_name(MyDatabaseId), + GetClusterUserName()); +#else Assert(poolHandle); +#endif PgxcNodeListAndCount(); pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); @@ -1051,9 +1292,10 @@ agent_handle_input(PoolAgent * agent, StringInfo s) { const char *database = NULL; const char *user_name = NULL; +#ifndef XCP const char *pgoptions = NULL; - const char *set_command = NULL; PoolCommandType command_type; +#endif int datanodecount; int coordcount; List *nodelist = NIL; @@ -1073,6 +1315,8 @@ agent_handle_input(PoolAgent * agent, StringInfo s) if (is_pool_locked && (qtype == 'a' || qtype == 'c' || qtype == 'g')) elog(WARNING,"Pool operation cannot run during pool lock"); + elog(DEBUG1, "Pooler is handling command %c from %d", (char) qtype, agent->pid); + switch (qtype) { case 'a': /* ABORT */ @@ -1093,6 +1337,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) if (pids) pfree(pids); break; +#ifndef XCP case 'b': /* Fire transaction-block commands on given nodes */ /* * Length of message is caused by: @@ -1119,6 +1364,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) list_free(datanodelist); list_free(coordlist); break; +#endif case 'c': /* CONNECT */ pool_getmessage(&agent->port, s, 0); agent->pid = pq_getmsgint(s, 4); @@ -1126,13 +1372,19 @@ agent_handle_input(PoolAgent * agent, StringInfo s) database = pq_getmsgbytes(s, len); len = pq_getmsgint(s, 4); user_name = pq_getmsgbytes(s, len); +#ifndef XCP len = pq_getmsgint(s, 4); pgoptions = pq_getmsgbytes(s, len); +#endif /* * Coordinator 
pool is not initialized. * With that it would be impossible to create a Database by default. */ +#ifdef XCP + agent_init(agent, database, user_name); +#else agent_init(agent, database, user_name, pgoptions); +#endif pq_getmsgend(s); break; case 'd': /* DISCONNECT */ @@ -1277,10 +1529,22 @@ agent_handle_input(PoolAgent * agent, StringInfo s) pool_sendres(&agent->port, res); break; case 'r': /* RELEASE CONNECTIONS */ +#ifdef XCP + { + bool destroy; + + pool_getmessage(&agent->port, s, 8); + destroy = (bool) pq_getmsgint(s, 4); + pq_getmsgend(s); + agent_release_connections(agent, destroy); + } +#else pool_getmessage(&agent->port, s, 4); pq_getmsgend(s); agent_release_connections(agent, false); +#endif break; +#ifndef XCP case 's': /* Session-related COMMAND */ pool_getmessage(&agent->port, s, 0); /* Determine if command is local or session */ @@ -1298,6 +1562,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) /* Send success result */ pool_sendres(&agent->port, res); break; +#endif default: /* EOF or protocol violation */ agent_destroy(agent); return; @@ -1308,6 +1573,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) } } +#ifndef XCP /* * Manage a session command for pooler */ @@ -1419,6 +1685,7 @@ agent_set_command(PoolAgent *agent, const char *set_command, PoolCommandType com return res; } +#endif /* * acquire connection @@ -1460,6 +1727,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) */ oldcontext = MemoryContextSwitchTo(agent->pool->mcxt); + /* Initialize result */ i = 0; /* Save in array fds of Datanodes first */ @@ -1489,8 +1757,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) * Local parameters are fired only once BEGIN has been launched on * remote nodes. 
*/ +#ifndef XCP if (agent->session_params) PGXCNodeSendSetQuery(slot->conn, agent->session_params); +#endif } result[i++] = PQsocket((PGconn *) agent->dn_connections[node]->conn); @@ -1522,8 +1792,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) * Local parameters are fired only once BEGIN has been launched on * remote nodes. */ +#ifndef XCP if (agent->session_params) PGXCNodeSendSetQuery(slot->conn, agent->session_params); +#endif } result[i++] = PQsocket((PGconn *) agent->coord_connections[node]->conn); @@ -1534,6 +1806,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) return result; } +#ifndef XCP /* * send transaction local commands if any, set the begin sent status in any case */ @@ -1605,6 +1878,7 @@ send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist) return -res; return 0; } +#endif /* * Cancel query @@ -1664,6 +1938,31 @@ cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlis /* * Return connections back to the pool */ +#ifdef XCP +void +PoolManagerReleaseConnections(bool force) +{ + char msgtype = 'r'; + int n32; + int msglen = 8; + + /* If disconnected from pooler all the connections already released */ + if (!poolHandle) + return; + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + + /* Message length */ + n32 = htonl(msglen); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + + /* Lock information */ + n32 = htonl((int) force); + pool_putbytes(&poolHandle->port, (char *) &n32, 4); + pool_flush(&poolHandle->port); +} +#else void PoolManagerReleaseConnections(void) { @@ -1671,6 +1970,8 @@ PoolManagerReleaseConnections(void) pool_putmessage(&poolHandle->port, 'r', NULL, 0); pool_flush(&poolHandle->port); } +#endif + /* * Cancel Query @@ -1736,7 +2037,15 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) if (!agent->dn_connections && !agent->coord_connections) return; +#ifdef XCP + if 
(!force_destroy && cluster_ex_lock_held) + { + elog(LOG, "Not releasing connection with cluster lock"); + return; + } +#endif +#ifndef XCP /* * If there are some session parameters or temporary objects, * do not put back connections to pool. @@ -1751,6 +2060,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) } if ((agent->session_params || agent->is_temp) && !force_destroy) return; +#endif /* * There are possible memory allocations in the core pooler, we want @@ -1788,9 +2098,21 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) agent->coord_connections[i] = NULL; } +#ifdef XCP + /* + * Released connections are now in the pool and we may want to close + * them eventually. Update the oldest_idle value to reflect the latest + * last access time if not already updated.. + */ + if (!force_destroy && agent->pool->oldest_idle == (time_t) 0) + agent->pool->oldest_idle = time(NULL); +#endif + MemoryContextSwitchTo(oldcontext); } + +#ifndef XCP /* * Reset session parameters for given connections in the agent. 
* This is done before putting back to pool connections that have been @@ -1814,7 +2136,7 @@ agent_reset_session(PoolAgent *agent) /* Reset given slot with parameters */ if (slot) - PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;"); + PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;SET GLOBAL_SESSION TO NONE;"); } } @@ -1827,7 +2149,7 @@ agent_reset_session(PoolAgent *agent) /* Reset given slot with parameters */ if (slot) - PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;"); + PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;SET GLOBAL_SESSION TO NONE;"); } } @@ -1843,6 +2165,7 @@ agent_reset_session(PoolAgent *agent) agent->local_params = NULL; } } +#endif /* @@ -1853,8 +2176,13 @@ agent_reset_session(PoolAgent *agent) * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory * error and POOL_WEXIST if poll for this database already exist. */ +#ifdef XCP +static DatabasePool *create_database_pool(const char *database, + const char *user_name) +#else static DatabasePool * create_database_pool(const char *database, const char *user_name, const char *pgoptions) +#endif { MemoryContext oldcontext; MemoryContext dbcontext; @@ -1884,8 +2212,13 @@ create_database_pool(const char *database, const char *user_name, const char *pg databasePool->database = pstrdup(database); /* Copy the user name */ databasePool->user_name = pstrdup(user_name); +#ifdef XCP + /* Reset the oldest_idle value */ + databasePool->oldest_idle = (time_t) 0; +#else /* Copy the pgoptions */ databasePool->pgoptions = pstrdup(pgoptions); +#endif if (!databasePool->database) { @@ -2031,8 +2364,14 @@ reload_database_pools(PoolAgent *agent) /* * Find pool for specified database and username in the list */ +#ifdef XCP +static DatabasePool * +find_database_pool(const char *database, + const char *user_name) +#else static DatabasePool * find_database_pool(const char *database, 
const char *user_name, const char *pgoptions) +#endif { DatabasePool *databasePool; @@ -2040,11 +2379,16 @@ find_database_pool(const char *database, const char *user_name, const char *pgop databasePool = databasePools; while (databasePool) { +#ifdef XCP + if (strcmp(database, databasePool->database) == 0 && + strcmp(user_name, databasePool->user_name) == 0) + break; +#else if (strcmp(database, databasePool->database) == 0 && strcmp(user_name, databasePool->user_name) == 0 && strcmp(pgoptions, databasePool->pgoptions) == 0) break; - +#endif databasePool = databasePool->next; } return databasePool; @@ -2185,6 +2529,9 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, { /* Insert the slot into the array and increase pool size */ nodePool->slot[(nodePool->freeSize)++] = slot; +#ifdef XCP + slot->released = time(NULL); +#endif } else { @@ -2204,6 +2551,10 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, static PGXCNodePool * grow_pool(DatabasePool *dbPool, Oid node) { +#ifdef XCP + /* if error try to release idle connections and try again */ + bool tryagain = true; +#endif PGXCNodePool *nodePool; bool found; @@ -2211,7 +2562,6 @@ grow_pool(DatabasePool *dbPool, Oid node) nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_ENTER, &found); - if (!found) { nodePool->connstr = build_node_conn_str(node, dbPool); @@ -2233,7 +2583,11 @@ grow_pool(DatabasePool *dbPool, Oid node) nodePool->size = 0; } +#ifdef XCP + while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize) +#else while (nodePool->size < MinPoolSize || (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)) +#endif { PGXCNodePoolSlot *slot; @@ -2257,10 +2611,33 @@ grow_pool(DatabasePool *dbPool, Oid node) ereport(LOG, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("failed to connect to Datanode"))); +#ifdef XCP + /* + * If we failed to connect probably number of connections on the + * target node reached max_connections. 
Try and release idle + * connections and try again. + * We do not want to enter endless loop here and run maintenance + * procedure only once. + * It is not safe to run the maintenance procedure if no connections + * from that pool currently in use - the node pool may be destroyed + * in that case. + */ + if (tryagain && nodePool->size > nodePool->freeSize) + { + pools_maintenance(); + tryagain = false; + continue; + } +#endif break; } slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); +#ifdef XCP + slot->released = time(NULL); + if (dbPool->oldest_idle == (time_t) 0) + dbPool->oldest_idle = slot->released; +#endif /* Insert at the end of the pool */ nodePool->slot[(nodePool->freeSize)++] = slot; @@ -2326,7 +2703,10 @@ destroy_node_pool(PGXCNodePool *node_pool) static void PoolerLoop(void) { - StringInfoData input_message; + StringInfoData input_message; +#ifdef XCP + time_t last_maintenance = (time_t) 0; +#endif server_fd = pool_listen(PoolerPort, UnixSocketDir); if (server_fd == -1) @@ -2335,6 +2715,7 @@ PoolerLoop(void) return; } initStringInfo(&input_message); + for (;;) { int nfds; @@ -2365,8 +2746,53 @@ PoolerLoop(void) nfds = Max(nfds, sockfd); } - /* wait for event */ +#ifdef XCP + if (PoolMaintenanceTimeout > 0) + { + struct timeval maintenance_timeout; + int timeout_val; + double timediff; + + /* + * Decide the timeout value based on when the last + * maintenance activity was carried out. If the last + * maintenance was done quite a while ago schedule the select + * with no timeout. 
It will serve any incoming activity + * and if there's none it will cause the maintenance + * to be scheduled as soon as possible + */ + timediff = difftime(time(NULL), last_maintenance); + + if (timediff > PoolMaintenanceTimeout) + timeout_val = 0; + else + timeout_val = PoolMaintenanceTimeout - rint(timediff); + + maintenance_timeout.tv_sec = timeout_val; + maintenance_timeout.tv_usec = 0; + /* wait for event */ + retval = select(nfds + 1, &rfds, NULL, NULL, &maintenance_timeout); + } + else +#endif retval = select(nfds + 1, &rfds, NULL, NULL, NULL); +#ifdef XCP + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (!PostmasterIsAlive()) + exit(1); + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + } +#endif if (shutdown_requested) { for (i = agentCount - 1; i >= 0; i--) @@ -2400,6 +2826,14 @@ PoolerLoop(void) if (FD_ISSET(server_fd, &rfds)) agent_create(); } +#ifdef XCP + else if (retval == 0) + { + /* maintenance timeout */ + pools_maintenance(); + last_maintenance = time(NULL); + } +#endif } } @@ -2530,6 +2964,17 @@ pooler_quickdie(SIGNAL_ARGS) exit(2); } + +#ifdef XCP +static void +pooler_sighup(SIGNAL_ARGS) +{ + got_SIGHUP = true; +} +#endif + + +#ifndef XCP bool IsPoolHandle(void) { @@ -2537,7 +2982,7 @@ IsPoolHandle(void) return false; return true; } - +#endif /* * Given node identifier, dbname and user name build connection string. @@ -2556,13 +3001,145 @@ build_node_conn_str(Oid node, DatabasePool *dbPool) return NULL; } +#ifdef XCP + connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost), + nodeDef->nodeport, + dbPool->database, + dbPool->user_name, + IS_PGXC_COORDINATOR ? "coordinator" : "datanode", + PGXCNodeName); +#else connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost), nodeDef->nodeport, dbPool->database, dbPool->user_name, dbPool->pgoptions, IS_PGXC_COORDINATOR ? 
"coordinator" : "datanode"); +#endif pfree(nodeDef); return connstr; } + + +#ifdef XCP +/* + * Check all pooled connections, and close which have been released more then + * PooledConnKeepAlive seconds ago. + * Return true if shrink operation closed all the connections and pool can be + * ddestroyed, false if there are still connections or pool is in use. + */ +static bool +shrink_pool(DatabasePool *pool) +{ + time_t now = time(NULL); + HASH_SEQ_STATUS hseq_status; + PGXCNodePool *nodePool; + int i; + bool empty = true; + + /* Negative PooledConnKeepAlive disables automatic connection cleanup */ + if (PoolConnKeepAlive < 0) + return false; + + pool->oldest_idle = (time_t) 0; + hash_seq_init(&hseq_status, pool->nodePools); + while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status))) + { + /* Go thru the free slots and destroy those that are free too long */ + for (i = 0; i < nodePool->freeSize; ) + { + PGXCNodePoolSlot *slot = nodePool->slot[i]; + + if (difftime(now, slot->released) > PoolConnKeepAlive) + { + /* connection is idle for long, close it */ + destroy_slot(slot); + /* reduce pool size and total number of connections */ + (nodePool->freeSize)--; + (nodePool->size)--; + /* move last connection in place, if not at last already */ + if (i < nodePool->freeSize) + nodePool->slot[i] = nodePool->slot[nodePool->freeSize]; + } + else + { + if (pool->oldest_idle == (time_t) 0 || + difftime(pool->oldest_idle, slot->released) > 0) + pool->oldest_idle = slot->released; + + i++; + } + } + if (nodePool->size > 0) + empty = false; + else + { + destroy_node_pool(nodePool); + hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); + } + } + + /* + * Last check, if any active agent is referencing the pool do not allow to + * destroy it, because there will be a problem if session wakes up and try + * to get a connection from non existing pool. 
+ * If all such sessions will eventually disconnect the pool will be + * destroyed during next maintenance procedure. + */ + if (empty) + { + for (i = 0; i < agentCount; i++) + { + if (poolAgents[i]->pool == pool) + return false; + } + } + + return empty; +} + + +/* + * Scan connection pools and release connections which are idle for long. + * If pool gets empty after releasing connections it is destroyed. + */ +static void +pools_maintenance(void) +{ + DatabasePool *prev = NULL; + DatabasePool *curr = databasePools; + time_t now = time(NULL); + int count = 0; + + /* Iterate over the pools */ + while (curr) + { + /* + * If current pool has connections to close and it is emptied after + * shrink remove the pool and free memory. + * Otherwithe move to next pool. + */ + if (curr->oldest_idle != (time_t) 0 && + difftime(now, curr->oldest_idle) > PoolConnKeepAlive && + shrink_pool(curr)) + { + MemoryContext mem = curr->mcxt; + curr = curr->next; + if (prev) + prev->next = curr; + else + databasePools = curr; + MemoryContextDelete(mem); + count++; + } + else + { + prev = curr; + curr = curr->next; + } + } + elog(DEBUG1, "Pool maintenance, done in %f seconds, removed %d pools", + difftime(time(NULL), now), count); +} +#endif diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index 594aa71af4..e383845101 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -4,6 +4,11 @@ * * Utilities for Postgres-XC pooler * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -26,8 +31,12 @@ #include "pgxc/pgxcnode.h" #include "access/gtm.h" #include "access/xact.h" +#include "catalog/pgxc_node.h" #include "commands/dbcommands.h" #include "commands/prepare.h" +#ifdef XCP +#include "storage/ipc.h" +#endif #include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -88,18 +97,38 @@ pgxc_pool_check(PG_FUNCTION_ARGS) Datum pgxc_pool_reload(PG_FUNCTION_ARGS) { +#ifndef XCP MemoryContext old_context; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to manage pooler")))); +#endif if (IsTransactionBlock()) ereport(ERROR, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), errmsg("pgxc_pool_reload cannot run inside a transaction block"))); +#ifdef XCP + /* Session is being reloaded, drop prepared and temporary objects */ + DropAllPreparedStatements(); + + /* Reinitialize session, it updates the shared memory table */ + InitMultinodeExecutor(true); + + /* Be sure it is done consistently */ + while (!PoolManagerCheckConnectionInfo()) + { + /* Reload connection information in pooler */ + PoolManagerReloadConnectionInfo(); + } + + /* Signal other sessions to reconnect to pooler if have privileges */ + if (superuser()) + ReloadConnInfoOnBackends(); +#else /* A Datanode has no pooler active, so do not bother about that */ if (IS_PGXC_DATANODE) PG_RETURN_BOOL(true); @@ -145,6 +174,7 @@ pgxc_pool_reload(PG_FUNCTION_ARGS) PoolManagerReconnect(); MemoryContextSwitchTo(old_context); +#endif PG_RETURN_BOOL(true); } @@ -289,6 +319,17 @@ CleanConnection(CleanConnStmt *stmt) foreach(nodelist_item, stmt->nodes) { char *node_name = strVal(lfirst(nodelist_item)); +#ifdef XCP + char node_type = PGXC_NODE_NONE; + stmt_nodes = lappend_int(stmt_nodes, + PGXCNodeGetNodeIdFromName(node_name, + &node_type)); + if (node_type == PGXC_NODE_NONE) + ereport(ERROR, + 
(errcode(ERRCODE_SYNTAX_ERROR), + errmsg("PGXC Node %s: object not defined", + node_name))); +#else Oid nodeoid = get_pgxc_nodeoid(node_name); if (!OidIsValid(nodeoid)) @@ -299,6 +340,7 @@ CleanConnection(CleanConnStmt *stmt) stmt_nodes = lappend_int(stmt_nodes, PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid))); +#endif } /* Build lists to be sent to Pooler Manager */ @@ -369,6 +411,20 @@ DropDBCleanConnection(char *dbname) void HandlePoolerReload(void) { +#ifdef XCP + if (proc_exit_inprogress) + return; + + /* Request query cancel, when convenient */ + InterruptPending = true; + QueryCancelPending = true; + + /* Disconnect from the pooler to get new connection infos next time */ + PoolManagerDisconnect(); + + /* Prevent using of cached connections to remote nodes */ + RequestInvalidateRemoteHandles(); +#else MemoryContext old_context; /* A Datanode has no pooler active, so do not bother about that */ @@ -407,4 +463,5 @@ HandlePoolerReload(void) CurrentResourceOwner = NULL; MemoryContextSwitchTo(old_context); +#endif } diff --git a/src/backend/pgxc/pool/postgresql_fdw.c b/src/backend/pgxc/pool/postgresql_fdw.c new file mode 100644 index 0000000000..e6e80805a9 --- /dev/null +++ b/src/backend/pgxc/pool/postgresql_fdw.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * postgresql_fdw.c + * foreign-data wrapper for PostgreSQL + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "pgxc/postgresql_fdw.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "nodes/makefuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/planmain.h" +#include "parser/scansup.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +#define DEBUG_FDW + +/* + * Check whether the function is IMMUTABLE. + */ +bool +is_immutable_func(Oid funcid) +{ + HeapTuple tp; + bool isnull; + Datum datum; + + tp = SearchSysCache(PROCOID, ObjectIdGetDatum(funcid), 0, 0, 0); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + +#ifdef DEBUG_FDW + /* print function name and its immutability */ + { + char *proname; + datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_proname, &isnull); + proname = pstrdup(DatumGetName(datum)->data); + elog(DEBUG1, "func %s(%u) is%s immutable", proname, funcid, + (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE) ? "" : " not"); + pfree(proname); + } +#endif + + datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_provolatile, &isnull); + ReleaseSysCache(tp); + + return (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE); +} + +/* + * Check whether the ExprState node should be evaluated in foreign server. + * + * An expression which consists of expressions below will be evaluated in + * the foreign server. 
+ * - constant value + * - variable (foreign table column) + * - external parameter (parameter of prepared statement) + * - array + * - bool expression (AND/OR/NOT) + * - NULL test (IS [NOT] NULL) + * - operator + * - IMMUTABLE only + * - It is required that the meaning of the operator be the same as the + * local server in the foreign server. + * - function + * - IMMUTABLE only + * - It is required that the meaning of the operator be the same as the + * local server in the foreign server. + * - scalar array operator (ANY/ALL) + */ +bool +pgxc_is_expr_shippable(Expr *node, bool *has_aggs) +{ +#ifdef XCP + return false; +#else + Shippability_context sc_context; + + /* Create the FQS context */ + memset(&sc_context, 0, sizeof(sc_context)); + sc_context.sc_query = NULL; + sc_context.sc_query_level = 0; + sc_context.sc_for_expr = true; + + /* Walk the expression to check its shippability */ + pgxc_shippability_walker((Node *)node, &sc_context); + + /* + * If caller is interested in knowing, whether the expression has aggregets + * let the caller know about it. The caller is capable of handling such + * expressions. Otherwise assume such an expression as unshippable. + */ + if (has_aggs) + *has_aggs = pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR); + else if (pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR)) + return false; + + /* + * If the expression unshippable or unsupported by expression shipping + * algorithm, return false. We don't have information about the number of + * nodes involved in expression evaluation, hence even if the expression can + * be evaluated only on single node, return false. 
+ */ + if (pgxc_test_shippability_reason(&sc_context, SS_UNSUPPORTED_EXPR) || + pgxc_test_shippability_reason(&sc_context, SS_UNSHIPPABLE_EXPR) || + pgxc_test_shippability_reason(&sc_context, SS_NEED_SINGLENODE)) + return false; + + /* If nothing wrong found, the expression is shippable */ + return true; +#endif +} diff --git a/src/backend/pgxc/squeue/Makefile b/src/backend/pgxc/squeue/Makefile new file mode 100644 index 0000000000..77d568813b --- /dev/null +++ b/src/backend/pgxc/squeue/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for distributed executor's shared memory queue +# +# Portions Copyright (c) 2011 StormDB +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/squeue +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = squeue.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c new file mode 100644 index 0000000000..0418779199 --- /dev/null +++ b/src/backend/pgxc/squeue/squeue.c @@ -0,0 +1,1509 @@ +/*------------------------------------------------------------------------- + * + * squeue.c + * + * Shared queue is for data exchange in shared memory between sessions, + * one of which is a producer, providing data rows. Others are consumer agents - + * sessions initiated from other datanodes, the main purpose of them is to read + * rows from the shared queue and send then to the parent data node. + * The producer is usually a consumer at the same time, it sends back tuples + * to the parent node without putting it to the queue. + * + * Copyright (c) 2012-2014, TransLattice, Inc. 
 *
 * IDENTIFICATION
 *	  $$
 *
 *
 *-------------------------------------------------------------------------
 */

#include <sys/time.h>
#include "postgres.h"

#include "miscadmin.h"
#include "access/gtm.h"
#include "catalog/pgxc_node.h"
#include "commands/prepare.h"
#include "executor/executor.h"
#include "pgxc/nodemgr.h"
#include "pgxc/pgxc.h"
#include "pgxc/pgxcnode.h"
#include "pgxc/squeue.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"
#include "utils/resowner.h"


/*
 * Tunables: number of shared queues and size of each queue.
 * NOTE(review): units of SQueueSize are not visible here — presumably
 * kilobytes, consumed via NUM_SQUEUES/SQUEUE_SIZE macros declared in
 * pgxc/squeue.h; confirm there.
 */
int NSQueues = 64;
int SQueueSize = 64;

/*
 * Sentinel written in place of a tuple length when the tuple is larger than
 * a consumer queue and must be transferred in chunks (see
 * sq_push_long_tuple/sq_pull_long_tuple below).
 */
#define LONG_TUPLE -42

/* Per-consumer synchronization: a lock guarding the queue and a wakeup latch */
typedef struct ConsumerSync
{
	LWLockId	cs_lwlock;		/* Synchronize access to the consumer queue */
	Latch		cs_latch;		/* The latch consumer is waiting on */
} ConsumerSync;


/*
 * Shared memory structure to store synchronization info to access shared queues
 */
typedef struct SQueueSync
{
	void	   *queue;			/* NULL if not assigned to any queue */
	Latch		sqs_producer_latch;	/* the latch producer is waiting on */
	ConsumerSync sqs_consumer_sync[0];	/* actual length is MaxDataNodes-1 is
										 * not known on compile time */
} SQueueSync;

/* Both producer and consumer are working */
#define CONSUMER_ACTIVE 0
/* Producer have finished work successfully and waits for consumer */
#define CONSUMER_EOF 1
/* Producer encountered error and waits for consumer to disconnect */
#define CONSUMER_ERROR 2
/* Consumer is finished with the query, OK to unbind */
#define CONSUMER_DONE 3


/* State of a single consumer */
typedef struct
{
	int			cs_pid;			/* Process id of the consumer session */
	int			cs_node;		/* Node id of the consumer parent */
	/*
	 * Queue state. The queue is a cyclic queue where stored tuples in the
	 * DataRow format, first goes the lengths of the tuple in host format,
	 * because it never sent over network followed by tuple bytes.
	 */
	int			cs_ntuples;		/* Number of tuples in the queue */
	int			cs_status;		/* See CONSUMER_* defines above */
	char	   *cs_qstart;		/* Where consumer queue begins */
	int			cs_qlength;		/* The size of the consumer queue */
	int			cs_qreadpos;	/* The read position in the consumer queue */
	int			cs_qwritepos;	/* The write position in the consumer queue */
#ifdef SQUEUE_STAT
	long		stat_writes;
	long		stat_reads;
	long		stat_buff_writes;
	long		stat_buff_reads;
	long		stat_buff_returns;
#endif
} ConsState;

/* Shared queue header */
typedef struct SQueueHeader
{
	char		sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
										 * beginning of the hash entry */
	int			sq_pid; 		/* Process id of the producer session */
	int			sq_nodeid;		/* Node id of the producer parent */
	SQueueSync *sq_sync;		/* Associated synchronization objects */
#ifdef SQUEUE_STAT
	bool		stat_finish;
	long		stat_paused;
#endif
	int			sq_nconsumers;	/* Number of consumers */
	ConsState	sq_consumers[0];/* variable length array */
} SQueueHeader;


/*
 * Hash table where all shared queues are stored. Key is the queue name, value
 * is SharedQueue
 */
static HTAB *SharedQueues = NULL;


/*
 * Pool of synchronization items
 */
static void *SQueueSyncs;

/* Size of one SQueueSync entry including its flexible consumer array */
#define SQUEUE_SYNC_SIZE \
	(sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync))

/* Address of the idx'th SQueueSync in the pool */
#define GET_SQUEUE_SYNC(idx) \
	((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE))

/* Size of the SharedQueue header including per-consumer state entries */
#define SQUEUE_HDR_SIZE(nconsumers) \
	(sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState))

/*
 * Free bytes in a consumer's ring buffer.  When the queue holds tuples the
 * free space is the gap between write and read positions (wrapping around);
 * when empty the whole buffer is free regardless of pointer positions.
 */
#define QUEUE_FREE_SPACE(cstate) \
	((cstate)->cs_ntuples > 0 ? \
		((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \
			 (cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \
			 (cstate)->cs_qlength + (cstate)->cs_qreadpos \
				- (cstate)->cs_qwritepos) \
		: (cstate)->cs_qlength)

/*
 * Append len bytes from buf to the consumer ring buffer, wrapping at the end.
 * Caller must have verified there is enough free space and must hold the
 * consumer's lwlock.
 */
#define QUEUE_WRITE(cstate, len, buf) \
	do \
	{ \
		if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \
		{ \
			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \
			(cstate)->cs_qwritepos += (len); \
			if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \
				(cstate)->cs_qwritepos = 0; \
		} \
		else \
		{ \
			int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \
			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \
			(cstate)->cs_qwritepos = (len) - part; \
			memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \
		} \
	} while(0)


/*
 * Read len bytes from the consumer ring buffer into buf, wrapping at the end.
 * Caller must have verified there are len bytes available and must hold the
 * consumer's lwlock.
 */
#define QUEUE_READ(cstate, len, buf) \
	do \
	{ \
		if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \
		{ \
			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \
			(cstate)->cs_qreadpos += (len); \
			if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \
				(cstate)->cs_qreadpos = 0; \
		} \
		else \
		{ \
			int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \
			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \
			(cstate)->cs_qreadpos = (len) - part; \
			memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \
		} \
	} while(0)


static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow);
static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
					 ConsumerSync *sync);

/*
 * SharedQueuesInit
 *    Initialize the reference on the shared memory hash table where all shared
 * queues are stored. Invoked during postmaster initialization.
 */
void
SharedQueuesInit(void)
{
	HASHCTL		info;
	int			hash_flags;
	bool		found;

	/*
	 * NOTE(review): info is not zeroed before use; with HASH_ELEM only
	 * keysize/entrysize should be consulted, but confirm ShmemInitHash
	 * ignores the remaining (uninitialized) HASHCTL fields.
	 */
	info.keysize = SQUEUE_KEYSIZE;
	info.entrysize = SQUEUE_SIZE;
	hash_flags = HASH_ELEM;

	SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES,
								 NUM_SQUEUES, &info, hash_flags);

	/*
	 * Synchronization stuff is in separate structure because we need to
	 * initialize all items now while in the postmaster.
	 * The structure is actually an array, each array entry is assigned to
	 * each instance of SharedQueue in use.
	 */
	SQueueSyncs = ShmemInitStruct("Shared Queues Sync",
								  SQUEUE_SYNC_SIZE * NUM_SQUEUES,
								  &found);
	if (!found)
	{
		int	i;

		/* First process to attach: initialize every latch and lwlock now */
		for (i = 0; i < NUM_SQUEUES; i++)
		{
			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
			int			j;

			sqs->queue = NULL;
			InitSharedLatch(&sqs->sqs_producer_latch);
			for (j = 0; j < MaxDataNodes-1; j++)
			{
				InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch);
				sqs->sqs_consumer_sync[j].cs_lwlock = LWLockAssign();
			}
		}
	}
}


/*
 * SharedQueueShmemSize
 *    Shared memory to reserve for the queue hash table entries plus the
 * synchronization pool.  Used when sizing the shmem segment.
 */
Size
SharedQueueShmemSize(void)
{
	Size		sq_size;
	Size		sqs_size;

	sq_size = mul_size(NUM_SQUEUES, SQUEUE_SIZE);
	sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE);

	return add_size(sq_size, sqs_size);
}

/*
 * SharedQueueAcquire
 *     Reserve a named shared queue for future data exchange between processes
 * supplying tuples to remote Datanodes. Invoked when a remote query plan is
 * registered on the Datanode. The number of consumers is known at this point,
 * so shared queue may be formatted during reservation. The first process that
 * is acquiring the shared queue on the Datanode does the formatting.
 */
void
SharedQueueAcquire(const char *sqname, int ncons)
{
	bool		found;
	SharedQueue sq;

	Assert(IsConnFromDatanode());
	Assert(ncons > 0);

tryagain:
	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);

	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found);
	/* First process acquiring queue should format it */
	if (!found)
	{
		int		qsize;   /* Size of one queue */
		int		i;
		char   *heapPtr;

		elog(LOG, "Format squeue %s for %d consumers", sqname, ncons);

		/* Initialize the shared queue */
		sq->sq_pid = 0;
		sq->sq_nodeid = -1;
#ifdef SQUEUE_STAT
		sq->stat_finish = false;
		sq->stat_paused = 0;
#endif
		/*
		 * Assign sync object (latches to wait on)
		 * XXX We may want to optimize this and do smart search instead of
		 * iterating the array.
		 * NOTE(review): if every SQueueSync slot is taken, sq->sq_sync is
		 * left unassigned here — presumably NUM_SQUEUES bounds the number
		 * of concurrently acquired queues; confirm.
		 */
		for (i = 0; i < NUM_SQUEUES; i++)
		{
			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
			if (sqs->queue == NULL)
			{
				sqs->queue = (void *) sq;
				sq->sq_sync = sqs;
				break;
			}
		}

		sq->sq_nconsumers = ncons;
		/* Determine queue size for a single consumer */
		qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers;

		heapPtr = (char *) sq;
		/* Skip header */
		heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers);
		/* Set up consumer queues */
		for (i = 0; i < ncons; i++)
		{
			ConsState  *cstate = &(sq->sq_consumers[i]);

			cstate->cs_pid = 0;
			cstate->cs_node = -1;
			cstate->cs_ntuples = 0;
			cstate->cs_status = CONSUMER_ACTIVE;
			cstate->cs_qstart = heapPtr;
			cstate->cs_qlength = qsize;
			cstate->cs_qreadpos = 0;
			cstate->cs_qwritepos = 0;
			heapPtr += qsize;
		}
		Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE);
	}
	else
	{
		/*
		 * A race condition is possible here. The previous operation might use
		 * the same Shared Queue name if that was different execution of the
		 * same Portal. So here we should try to determine if that Shared Queue
		 * belongs to this execution or that is not-yet-released Shared Queue
		 * of previous operation.
		 * Though at the moment I am not sure, but I believe the BIND stage is
		 * only happening after completion of ACQUIRE stage, so it is enough
		 * to verify the producer (the very first node that binds) is not bound
		 * yet. If it is bound, sleep for a moment and try again. No reason to
		 * sleep longer, the producer needs just a quantum of CPU time to UNBIND
		 * itself.
		 */
		if (sq->sq_pid != 0)
		{
			int			selfid;	/* Node Id of the parent data node */
			int			i;
			char		ntype = PGXC_NODE_DATANODE;
			bool		old_squeue = true;

			selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState *cstate = &(sq->sq_consumers[i]);
				if (cstate->cs_node == selfid)
				{
					SQueueSync *sqsync = sq->sq_sync;

					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
								  LW_EXCLUSIVE);
					/* verify status */
					if (cstate->cs_status != CONSUMER_DONE)
						old_squeue = false;

					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
					break;
				}
			}
			if (old_squeue)
			{
				/*
				 * Still the previous execution's queue: back off briefly
				 * and retry.  NOTE(review): pg_usleep(1L) is one
				 * microsecond — effectively a yield; confirm intended.
				 */
				LWLockRelease(SQueuesLock);
				pg_usleep(1L);
				goto tryagain;
			}

		}
	}
	LWLockRelease(SQueuesLock);
}


/*
 * SharedQueueBind
 *    Bind to the shared queue specified by sqname either as a consumer or as a
 * producer. The first process that binds to the shared queue becomes a producer
 * and receives the consumer map, others become consumers and receive queue
 * indexes to read tuples from.
 * The consNodes int list identifies the nodes involved in the current step.
 * The distNodes int list describes result distribution of the current step.
 * The consNodes should be a subset of distNodes.
 * The myindex and consMap parameters are binding results. If caller process
 * is bound to the query as a producer myindex is set to -1 and index of the
 * each consumer (order number in the consNodes) is stored to the consMap array
 * at the position of the node in the distNodes.
 * For the producer node
 * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
 * consNodes or if it was reported they won't read results, they are represented
 * as SQ_CONS_NONE.
 */
SharedQueue
SharedQueueBind(const char *sqname, List *consNodes,
				List *distNodes, int *myindex, int *consMap)
{
	bool		found;
	SharedQueue sq;
	int			selfid;	/* Node Id of the parent data node */
	char		ntype = PGXC_NODE_DATANODE;

	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);

	selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);

	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
	if (!found)
		elog(PANIC, "Shared queue %s not found", sqname);
	if (sq->sq_pid == 0)
	{
		/* Producer: first process to bind claims the producer role */
		int			i;
		ListCell   *lc;

		Assert(consMap);

		elog(LOG, "Bind node %s to squeue of step %s as a producer",
			 PGXC_PARENT_NODE, sqname);

		/* Initialize the shared queue */
		sq->sq_pid = MyProcPid;
		sq->sq_nodeid = selfid;
		OwnLatch(&sq->sq_sync->sqs_producer_latch);

		i = 0;
		foreach(lc, distNodes)
		{
			int			nodeid = lfirst_int(lc);

			/*
			 * Producer won't go to shared queue to hand off tuple to itself,
			 * so we do not need to create queue for that entry.
			 */
			if (nodeid == selfid)
			{
				/* Producer must be in the consNodes list */
				Assert(list_member_int(consNodes, nodeid));
				consMap[i++] = SQ_CONS_SELF;
			}
			/*
			 * This node may connect as a consumer, store consumer id to the map
			 * and initialize consumer queue
			 */
			else if (list_member_int(consNodes, nodeid))
			{
				ConsState  *cstate;
				int 		j;

				for (j = 0; j < sq->sq_nconsumers; j++)
				{
					cstate = &(sq->sq_consumers[j]);
					if (cstate->cs_node == nodeid)
					{
						/* The process already reported that queue won't read */
						elog(LOG, "Node %d of step %s is released already",
							 nodeid, sqname);
						consMap[i++] = SQ_CONS_NONE;
						break;
					}
					else if (cstate->cs_node == -1)
					{
						/* found unused slot, assign the consumer to it */
						consMap[i++] = j;
						cstate->cs_node = nodeid;
						break;
					}
				}
			}
			/*
			 * Consumer from this node won't ever connect as upper level step
			 * is not executed on the node. Discard resuls that may go to that
			 * node, if any.
			 */
			else
			{
				consMap[i++] = SQ_CONS_NONE;
			}
		}

		if (myindex)
			*myindex = -1;
	}
	else
	{
		/* Consumer: find our slot and take ownership of its latch */
		int			nconsumers;
		ListCell   *lc;

		/* Producer should be different process */
		Assert(sq->sq_pid != MyProcPid);

		elog(LOG, "Bind node %s to squeue of step %s as a consumer of process %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);

		/* Sanity checks */
		Assert(myindex);
		*myindex = -1;
		/* Ensure the passed in consumer list matches the queue */
		nconsumers = 0;
		foreach (lc, consNodes)
		{
			int 		nodeid = lfirst_int(lc);
			int			i;

			if (nodeid == sq->sq_nodeid)
			{
				/*
				 * This node is a producer it should be in the consumer list,
				 * but no consumer queue for it
				 */
				continue;
			}

			/* find consumer queue for the node */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState *cstate = &(sq->sq_consumers[i]);
				if (cstate->cs_node == nodeid)
				{
					nconsumers++;
					if (nodeid == selfid)
					{
						/*
						 * Current consumer queue is that from which current
						 * session will be sending out data rows.
						 * Initialize the queue to let producer know we are
						 * here and runnng.
						 */
						SQueueSync *sqsync = sq->sq_sync;

						LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
									  LW_EXCLUSIVE);
						/* Make sure no consumer bound to the queue already */
						Assert(cstate->cs_pid == 0);
						/* make sure the queue is ready to read */
						Assert(cstate->cs_qlength > 0);
						/* verify status */
						if (cstate->cs_status == CONSUMER_ERROR ||
								cstate->cs_status == CONSUMER_DONE)
						{
							/*
							 * Producer failed by the time the consumer connect.
							 * Change status to "Done" to allow producer unbind
							 * and report problem to the parent.
							 */
							cstate->cs_status = CONSUMER_DONE;
							/* Producer may be waiting for status change */
							SetLatch(&sqsync->sqs_producer_latch);
							LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
							LWLockRelease(SQueuesLock);
							ereport(ERROR,
									(errcode(ERRCODE_PRODUCER_ERROR),
									 errmsg("producer error")));
						}
						/*
						 * Any other status is acceptable. Normally it would be
						 * ACTIVE. If producer have had only few rows to emit
						 * and it is already done the status would be EOF.
						 */
						/* Set up the consumer */
						cstate->cs_pid = MyProcPid;
						/* return found index */
						*myindex = i;
						OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
						LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
					}
					break;
				}
			}
			/* Check if entry was found and therefore loop was broken */
			Assert(i < sq->sq_nconsumers);
		}
		/* Check the consumer is found */
		Assert(*myindex != -1);
		Assert(sq->sq_nconsumers == nconsumers);
	}
	LWLockRelease(SQueuesLock);
	return sq;
}


/*
 * Push data from the local tuplestore to the queue for specified consumer.
 * Return true if succeeded and the tuplestore is now empty. Return false
 * if specified queue has not enough room for the next tuple.
 */
static bool
SharedQueueDump(SharedQueue squeue, int consumerIdx,
					   TupleTableSlot *tmpslot, Tuplestorestate *tuplestore)
{
	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);

	/* discard stored data if consumer is not active */
	if (cstate->cs_status != CONSUMER_ACTIVE)
	{
		tuplestore_clear(tuplestore);
		return true;
	}

	/*
	 * Tuplestore does not clear eof flag on the active read pointer, causing
	 * the store is always in EOF state once reached when there is a single
	 * read pointer. We do not want behavior like this and workaround by using
	 * secondary read pointer. Primary read pointer (0) is active when we are
	 * writing to the tuple store, also it is used to bookmark current position
	 * when reading to be able to roll back and return just read tuple back to
	 * the store if we failed to write it out to the queue.
	 * Secondary read pointer is for reading, and its eof flag is cleared if a
	 * tuple is written to the store.
	 */
	tuplestore_select_read_pointer(tuplestore, 1);

	/* If we have something in the tuplestore try to push this to the queue */
	while (!tuplestore_ateof(tuplestore))
	{
		/* save position */
		tuplestore_copy_read_pointer(tuplestore, 1, 0);

		/* Try to get next tuple to the temporary slot */
		if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot))
		{
			/* false means the tuplestore in EOF state */
			break;
		}
#ifdef SQUEUE_STAT
		cstate->stat_buff_reads++;
#endif

		/* The slot should contain a data row */
		Assert(tmpslot->tts_datarow);

		/* check if queue has enough room for the data */
		if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen)
		{
			/*
			 * If stored tuple does not fit empty queue we are entering special
			 * procedure of pushing it through.
			 */
			if (cstate->cs_ntuples <= 0)
			{
				/*
				 * If pushing throw is completed wake up and proceed to next
				 * tuple, there could be enough space in the consumer queue to
				 * fit more.
				 */
				bool done = sq_push_long_tuple(cstate, tmpslot->tts_datarow);

				/*
				 * sq_push_long_tuple writes some data anyway, so wake up
				 * the consumer.
				 */
				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);

				if (done)
					continue;
			}

			/* Restore read position to get same tuple next time */
			tuplestore_copy_read_pointer(tuplestore, 0, 1);
#ifdef SQUEUE_STAT
			cstate->stat_buff_returns++;
#endif

			/* We might advance the mark, try to truncate */
			tuplestore_trim(tuplestore);

			/* Prepare for writing, set proper read pointer */
			tuplestore_select_read_pointer(tuplestore, 0);

			/* ... and exit */
			return false;
		}
		else
		{
			/* Enqueue data: host-format length prefix followed by the row */
			QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen);
			QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg);

			/* Increment tuple counter. If it was 0 consumer may be waiting for
			 * data so try to wake it up */
			if ((cstate->cs_ntuples)++ == 0)
				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
		}
	}

	/* Remove rows we have just read */
	tuplestore_trim(tuplestore);

	/* prepare for writes, set read pointer 0 as active */
	tuplestore_select_read_pointer(tuplestore, 0);

	return true;
}


/*
 * SharedQueueWrite
 *    Write data from the specified slot to the specified queue. If the
 * tuplestore passed in has tuples try and write them first.
 * If specified queue is full the tuple is put into the tuplestore which is
 * created if necessary
 */
void
SharedQueueWrite(SharedQueue squeue, int consumerIdx,
				 TupleTableSlot *slot, Tuplestorestate **tuplestore,
				 MemoryContext tmpcxt)
{
	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
	SQueueSync *sqsync = squeue->sq_sync;
	LWLockId	clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock;
	RemoteDataRow datarow;
	bool		free_datarow;

	Assert(cstate->cs_qlength > 0);

	LWLockAcquire(clwlock, LW_EXCLUSIVE);

#ifdef SQUEUE_STAT
	cstate->stat_writes++;
#endif

	/*
	 * If we have anything in the local storage try to dump this first,
	 * but do not try to dump often to avoid overhead of creating temporary
	 * tuple slot. It should be OK to dump if queue is half empty.
	 */
	if (*tuplestore)
	{
		bool dumped = false;

		if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
		{
			TupleTableSlot *tmpslot;

			tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor);
			dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore);
			ExecDropSingleTupleTableSlot(tmpslot);
		}
		if (!dumped)
		{
			/* No room to even dump local store, append the tuple to the store
			 * and exit */
#ifdef SQUEUE_STAT
			cstate->stat_buff_writes++;
#endif
			LWLockRelease(clwlock);
			tuplestore_puttupleslot(*tuplestore, slot);
			return;
		}
	}

	/* Get datarow from the tuple slot */
	if (slot->tts_datarow)
	{
		/*
		 * The function ExecCopySlotDatarow always make a copy, but here we
		 * can optimize and avoid copying the data, so we just get the reference
		 */
		datarow = slot->tts_datarow;
		free_datarow = false;
	}
	else
	{
		datarow = ExecCopySlotDatarow(slot, tmpcxt);
		free_datarow = true;
	}
	if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen)
	{
		/* Not enough room, store tuple locally */
		LWLockRelease(clwlock);

		/* clean up */
		if (free_datarow)
			pfree(datarow);

		/* Create tuplestore if does not exist */
		if (*tuplestore == NULL)
		{
			int			ptrno;
			char		storename[64];

#ifdef SQUEUE_STAT
			elog(LOG, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far",
				 squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads);
#endif
			*tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt);
			/* We need is to be able to remember/restore the read position */
			snprintf(storename, 64, "%s node %d", squeue->sq_key, cstate->cs_node);
			tuplestore_collect_stat(*tuplestore, storename);
			/*
			 * Allocate a second read pointer to read from the store. We know
			 * it must have index 1, so needn't store that.
			 */
			ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0);
			Assert(ptrno == 1);
		}

#ifdef SQUEUE_STAT
		cstate->stat_buff_writes++;
#endif
		/* Append the slot to the store... */
		tuplestore_puttupleslot(*tuplestore, slot);

		/* ... and exit */
		return;
	}
	else
	{
		/* do not supply data to closed consumer */
		if (cstate->cs_status == CONSUMER_ACTIVE)
		{
			/* write out the data */
			QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
			QUEUE_WRITE(cstate, datarow->msglen, datarow->msg);
			/* Increment tuple counter. If it was 0 consumer may be waiting for
			 * data so try to wake it up */
			if ((cstate->cs_ntuples)++ == 0)
				SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
		}

		/* clean up */
		if (free_datarow)
			pfree(datarow);
	}
	LWLockRelease(clwlock);
}


/*
 * SharedQueueRead
 *    Read one data row from the specified queue into the provided tupleslot.
 * Returns true if EOF is reached on the specified consumer queue.
 * If the queue is empty, behavior is controlled by the canwait parameter.
 * If canwait is true it is waiting while row is available or EOF or error is
 * reported, if it is false, the slot is emptied and false is returned.
 */
bool
SharedQueueRead(SharedQueue squeue, int consumerIdx,
				TupleTableSlot *slot, bool canwait)
{
	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
	SQueueSync *sqsync = squeue->sq_sync;
	RemoteDataRow datarow;
	int			datalen;

	Assert(cstate->cs_qlength > 0);

	LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);

	Assert(cstate->cs_status != CONSUMER_DONE);
	while (cstate->cs_ntuples <= 0)
	{
		if (cstate->cs_status == CONSUMER_EOF)
		{
			/* Inform producer the consumer have done the job */
			cstate->cs_status = CONSUMER_DONE;
			/* no need to receive notifications */
			DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
			/* producer done the job and no more rows expected, clean up */
			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
			ExecClearTuple(slot);
			/*
			 * notify the producer, it may be waiting while consumers
			 * are finishing
			 */
			SetLatch(&sqsync->sqs_producer_latch);
			elog(LOG, "EOF reached while reading from squeue, exiting");
			return true;
		}
		else if (cstate->cs_status == CONSUMER_ERROR)
		{
			/*
			 * There was a producer error while waiting.
			 * Release all the locks and report problem to the caller.
			 */
			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
			/*
			 * Reporting error will cause transaction rollback and clean up of
			 * all portals. We can not mark the portal so it does not access
			 * the queue so we should hold it for now. We should prevent queue
			 * unbound in between.
			 */
			ereport(ERROR,
					(errcode(ERRCODE_PRODUCER_ERROR),
					 errmsg("producer error")));
		}
		if (canwait)
		{
			/* Prepare waiting on empty buffer */
			ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
			/* Wait for notification about available info */
			WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
			/* got the notification, restore lock and try again */
			LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
		}
		else
		{
			/* Empty queue and caller does not want to block */
			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
			ExecClearTuple(slot);
			return false;
		}
	}
	/* have at least one row, read it in and store to slot */
	QUEUE_READ(cstate, sizeof(int), (char *) (&datalen));
	datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen);
	datarow->msgnode = InvalidOid;
	datarow->msglen = datalen;
	/* Tuples longer than the queue are transferred in chunks */
	if (datalen > cstate->cs_qlength - sizeof(int))
		sq_pull_long_tuple(cstate, datarow,
						   &sqsync->sqs_consumer_sync[consumerIdx]);
	else
		QUEUE_READ(cstate, datalen, datarow->msg);
	ExecStoreDataRowTuple(datarow, slot, true);
	(cstate->cs_ntuples)--;
#ifdef SQUEUE_STAT
	cstate->stat_reads++;
#endif
	/* sanity check */
	Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos));
	LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
	return false;
}


/*
 * Mark specified consumer as closed discarding all input which may already be
 * in the queue.
 * If consumerIdx is -1 the producer is cleaned up. Producer need to wait for
 * consumers before releasing the queue, so if there are yet active consumers,
 * they are notified about the problem and they should disconnect from the
 * queue as soon as possible.
+ */ +void +SharedQueueReset(SharedQueue squeue, int consumerIdx) +{ + SQueueSync *sqsync = squeue->sq_sync; + + if (consumerIdx == -1) + { + int i; + + /* check queue states */ + for (i = 0; i < squeue->sq_nconsumers; i++) + { + ConsState *cstate = &squeue->sq_consumers[i]; + LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); + + /* + * If producer being reset before it is reached the end of the + * result set, that means consumer probably would not get all + * the rows and it should report error if the consumer's parent ever + * try to read. No need to raise error if consumer is just closed. + * If consumer is done already we do not need to change the status. + */ + if (cstate->cs_status != CONSUMER_EOF && + cstate->cs_status != CONSUMER_DONE) + { + elog(LOG, "Consumer %d of producer %s is cancelled", i, squeue->sq_key); + cstate->cs_status = CONSUMER_ERROR; + /* discard tuples which may already be in the queue */ + cstate->cs_ntuples = 0; + /* keep consistent with cs_ntuples*/ + cstate->cs_qreadpos = cstate->cs_qwritepos = 0; + + /* wake up consumer if it is sleeping */ + SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); + } + LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); + } + elog(LOG, "Reset producer %s", squeue->sq_key); + } + else + { + ConsState *cstate = &(squeue->sq_consumers[consumerIdx]); + LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, + LW_EXCLUSIVE); + + if (cstate->cs_status != CONSUMER_DONE) + { + /* Inform producer the consumer have done the job */ + cstate->cs_status = CONSUMER_DONE; + /* + * No longer need to receive notifications. 
If consumer has not + * connected the latch is not owned + */ + if (cstate->cs_pid > 0) + DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch); + /* + * notify the producer, it may be waiting while consumers + * are finishing + */ + SetLatch(&sqsync->sqs_producer_latch); + elog(LOG, "Reset consumer %d of %s", consumerIdx, squeue->sq_key); + } + + LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); + } +} + + +/* + * Assume that not yet connected consumers won't connect and reset them. + * That should allow to Finish/UnBind the queue gracefully and prevent + * producer hanging. + */ +int +SharedQueueResetNotConnected(SharedQueue squeue) +{ + SQueueSync *sqsync = squeue->sq_sync; + int result = 0; + int i; + + /* check queue states */ + for (i = 0; i < squeue->sq_nconsumers; i++) + { + ConsState *cstate = &squeue->sq_consumers[i]; + LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); + + if (cstate->cs_pid == 0 && + cstate->cs_status != CONSUMER_EOF && + cstate->cs_status != CONSUMER_DONE) + { + result++; + elog(LOG, "Consumer %d of producer %s is cancelled", i, squeue->sq_key); + cstate->cs_status = CONSUMER_ERROR; + /* discard tuples which may already be in the queue */ + cstate->cs_ntuples = 0; + /* keep consistent with cs_ntuples*/ + cstate->cs_qreadpos = cstate->cs_qwritepos = 0; + + /* wake up consumer if it is sleeping */ + SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); + } + LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); + } + elog(LOG, "Reset producer %s", squeue->sq_key); +} + + +/* + * Determine if producer can safely pause work. + * The producer can pause if all consumers have enough data to read while + * producer is sleeping. + * Obvoius case when the producer can not pause if at least one queue is empty. 
+ */ +bool +SharedQueueCanPause(SharedQueue squeue) +{ + SQueueSync *sqsync = squeue->sq_sync; + bool result = true; + int usedspace; + int ncons; + int i; + + usedspace = 0; + ncons = 0; + for (i = 0; result && (i < squeue->sq_nconsumers); i++) + { + ConsState *cstate = &(squeue->sq_consumers[i]); + LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED); + /* + * Count only consumers that may be blocked. + * If producer has finished scanning and pushing local buffers some + * consumers may be finished already. + */ + if (cstate->cs_status == CONSUMER_ACTIVE) + { + /* can not pause if some queue is empty */ + result = (cstate->cs_ntuples > 0); + usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ? + cstate->cs_qwritepos - cstate->cs_qreadpos : + cstate->cs_qlength + cstate->cs_qwritepos + - cstate->cs_qreadpos); + ncons++; + } + LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); + } + /* + * Pause only if average consumer queue is full more then on half. + */ + if (result) + result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2); +#ifdef SQUEUE_STAT + if (result) + squeue->stat_paused++; +#endif + return result; +} + + +int +SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, + Tuplestorestate **tuplestore) +{ + SQueueSync *sqsync = squeue->sq_sync; + TupleTableSlot *tmpslot = NULL; + int i; + int nstores = 0; + + for (i = 0; i < squeue->sq_nconsumers; i++) + { + ConsState *cstate = &squeue->sq_consumers[i]; + LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); +#ifdef SQUEUE_STAT + if (!squeue->stat_finish) + elog(LOG, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer", + squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns); +#endif + /* + * if the tuplestore has data and consumer queue has space for some + * try to push rows to the queue. 
We do not want to do that often + * to avoid overhead of temp tuple slot allocation. + */ + if (tuplestore[i]) + { + /* If the consumer is not reading just destroy the tuplestore */ + if (cstate->cs_status != CONSUMER_ACTIVE) + { + tuplestore_end(tuplestore[i]); + tuplestore[i] = NULL; + } + else + { + nstores++; + /* + * Attempt to dump tuples from the store require tuple slot + * allocation, that is not a cheap operation, so proceed if + * target queue has enough space. + */ + if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2) + { + if (tmpslot == NULL) + tmpslot = MakeSingleTupleTableSlot(tupDesc); + if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i])) + { + tuplestore_end(tuplestore[i]); + tuplestore[i] = NULL; + cstate->cs_status = CONSUMER_EOF; + nstores--; + } + /* Consumer may be sleeping, wake it up */ + SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); + } + } + } + else + { + /* it set eof if not yet set */ + if (cstate->cs_status == CONSUMER_ACTIVE) + { + cstate->cs_status = CONSUMER_EOF; + SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); + } + } + LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); + } + if (tmpslot) + ExecDropSingleTupleTableSlot(tmpslot); + +#ifdef SQUEUE_STAT + squeue->stat_finish = true; +#endif + + return nstores; +} + + +/* + * SharedQueueUnBind + * Cancel binding of current process to the shared queue. If the process + * was a producer it should pass in the array of tuplestores where tuples were + * queueed when it was unsafe to block. If any of the tuplestores holds data + * rows they are written to the queue. The length of the array of the + * tuplestores should be the same as the count of consumers. It is OK if some + * entries are NULL. When a consumer unbinds from the shared queue it should + * set the tuplestore parameter to NULL. 
 */
void
SharedQueueUnBind(SharedQueue squeue)
{
	SQueueSync *sqsync = squeue->sq_sync;
	int			wait_result = 0;

	/* loop while there are active consumers */
	for (;;)
	{
		int i;
		int c_count = 0;

		/* check queue states */
		for (i = 0; i < squeue->sq_nconsumers; i++)
		{
			ConsState *cstate = &squeue->sq_consumers[i];
			LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
			/* is consumer working yet ? */
			if (cstate->cs_status == CONSUMER_ACTIVE)
				cstate->cs_status = CONSUMER_ERROR;
			if (cstate->cs_status != CONSUMER_DONE)
			{
				c_count++;
				/* Wake up consumer if it is sleeping */
				SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
				/* producer will continue waiting */
				ResetLatch(&sqsync->sqs_producer_latch);
			}
#ifdef SQUEUE_STAT
			else
				elog(LOG, "Done %s node %d, %ld writes and %ld reads, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
					 squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
#endif

			LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
		}
		if (c_count == 0)
			break;
		elog(LOG, "Wait while %d squeue readers finishing", c_count);
		/* wait for a notification; give up after a 10 second timeout */
		wait_result = WaitLatch(&sqsync->sqs_producer_latch,
								WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
								10000L);
		if (wait_result & WL_TIMEOUT)
			break;
		/* got notification, continue loop */
	}
#ifdef SQUEUE_STAT
	elog(LOG, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused);
#endif

	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
	/* All is done, clean up */
	DisownLatch(&sqsync->sqs_producer_latch);

	/* Now it is OK to remove hash table entry */
	squeue->sq_sync = NULL;
	sqsync->queue = NULL;
	if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue)
		elog(PANIC, "Shared queue data corruption");

	LWLockRelease(SQueuesLock);
	elog(LOG, "Finalized squeue");
	if (wait_result & WL_TIMEOUT)
		elog(FATAL, "Timeout while waiting for Consumers finishing");
}


/*
 * If queue with specified name still exists set mark respective consumer as
 * "Done". Due to executor optimization consumer may never connect the queue,
 * and should allow producer to finish it up if it is known the consumer will
 * never connect.
 */
void
SharedQueueRelease(const char *sqname)
{
	bool					found;
	volatile SharedQueue	sq;

	elog(LOG, "Shared Queue release: %s", sqname);

	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);

	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
	if (found)
	{
		volatile SQueueSync    *sqsync = sq->sq_sync;
		int						myid;	/* Node Id of the parent data node */
		int						i;
		char					ntype = PGXC_NODE_DATANODE;

		Assert(sqsync && sqsync->queue == sq);

		/*
		 * Case if the shared queue was never bound.
		 * Just remove it from the hash table.
		 */
		if (sq->sq_nodeid == -1)
		{
			sq->sq_sync = NULL;
			sqsync->queue = NULL;
			if (hash_search(SharedQueues, sqname, HASH_REMOVE, NULL) != sq)
				elog(PANIC, "Shared queue data corruption");
			elog(LOG, "Finalized squeue %s", sqname);
			LWLockRelease(SQueuesLock);
			return;
		}

		myid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);
		/*
		 * Do not bother releasing producer, all necessary work will be
		 * done upon UnBind.
		 */
		if (sq->sq_nodeid != myid)
		{
			elog(LOG, "Looking for consumer %d in %s", myid, sqname);
			/* find specified node in the consumer lists */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState  *cstate = &(sq->sq_consumers[i]);
				if (cstate->cs_node == myid)
				{
					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
								  LW_EXCLUSIVE);
					if (cstate->cs_status != CONSUMER_DONE)
					{
						/* Inform producer the consumer have done the job */
						cstate->cs_status = CONSUMER_DONE;
						/* no need to receive notifications */
						if (cstate->cs_pid > 0)
						{
							DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
							cstate->cs_pid = 0;
						}
						/*
						 * notify the producer, it may be waiting while
						 * consumers are finishing
						 */
						SetLatch(&sqsync->sqs_producer_latch);
						elog(LOG, "Release consumer %d of %s", i, sqname);
					}
					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
					/* exit */
					LWLockRelease(SQueuesLock);
					return;
				}
			}
			/*
			 * The consumer was never bound. Find empty consumer slot and
			 * register node here to let producer know that the node will never
			 * be consuming.
			 * NOTE(review): this loop marks EVERY unassigned slot DONE (no
			 * break, cs_node is never set to myid) — verify that releasing
			 * all empty slots rather than claiming a single one is intended.
			 */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState  *cstate = &(sq->sq_consumers[i]);
				if (cstate->cs_node == -1)
				{
					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
								  LW_EXCLUSIVE);
					/* Inform producer the consumer have done the job */
					cstate->cs_status = CONSUMER_DONE;
					SetLatch(&sqsync->sqs_producer_latch);
					elog(LOG, "Release not bound consumer %d of %s", i, sqname);
					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
				}
			}
		}
	}
	LWLockRelease(SQueuesLock);
}


/*
 * Called when the backend is ending.
 */
void
SharedQueuesCleanup(int code, Datum arg)
{
	/* Need to be able to look into catalogs */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");

	/*
	 * Release all registered prepared statements.
	 * If a shared queue name is associated with the statement this queue will
	 * be released.
+ */ + DropAllPreparedStatements(); + + /* Release everything */ + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true); + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true); + ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true); + CurrentResourceOwner = NULL; +} + + +/* + * sq_push_long_tuple + * Routine to push through the consumer state tuple longer the the consumer + * queue. Long tuple is written by a producer partially, and only when the + * consumer queue is empty. + * The consumer can determine that the tuple being read is long if the length + * of the tuple which is read before data is exceeding queue length. + * Consumers is switching to the long tuple mode and read in the portion of + * data which is already in the queue. After reading in each portion of data + * consumer sets cs_ntuples to LONG_TUPLE to indicate it is in long tuple + * mode, and writes out number of already read bytes to the beginning of the + * queue. + * While Consumer is reading in tuple data Producer may work on other task: + * execute query and send tuples to other Customers. If Producer sees the + * LONG_TUPLE indicator it may write out next portion. The tuple remains + * current in the tuplestore, and Producer just needs to read offset from + * the buffer to know what part of data to write next. + * After tuple is completely written the Producer is advancing to next tuple + * and continue operation in normal mode. + */ +static bool +sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow) +{ + if (cstate->cs_ntuples == 0) + { + /* the tuple is too big to fit the queue, start pushing it through */ + int len; + /* + * Output actual message size, to prepare consumer: + * allocate memory and set up transmission. 
+ */ + QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen); + /* Output as much as possible */ + len = cstate->cs_qlength - sizeof(int); + Assert(datarow->msglen > len); + QUEUE_WRITE(cstate, len, datarow->msg); + cstate->cs_ntuples = 1; + return false; + } + else + { + int offset; + int len; + + /* Continue pushing through long tuple */ + Assert(cstate->cs_ntuples == LONG_TUPLE); + /* + * Consumer outputs number of bytes already read at the beginning of + * the queue. + */ + memcpy(&offset, cstate->cs_qstart, sizeof(int)); + + Assert(offset > 0 && offset < datarow->msglen); + + /* remaining data */ + len = datarow->msglen - offset; + /* + * We are sending remaining lengs just for sanity check at the consumer + * side + */ + QUEUE_WRITE(cstate, sizeof(int), (char *) &len); + if (len > cstate->cs_qlength - sizeof(int)) + { + /* does not fit yet */ + len = cstate->cs_qlength - sizeof(int); + QUEUE_WRITE(cstate, len, datarow->msg + offset); + cstate->cs_ntuples = 1; + return false; + } + else + { + /* now we are done */ + QUEUE_WRITE(cstate, len, datarow->msg + offset); + cstate->cs_ntuples = 1; + return true; + } + } +} + + +/* + * sq_pull_long_tuple + * Read in from the queue data of a long tuple which does not the queue. 
+ * See sq_push_long_tuple for more details + */ +static void +sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow, + ConsumerSync *sync) +{ + int offset = 0; + int len = datarow->msglen; + + for (;;) + { + /* determine how many bytes to read */ + if (len > cstate->cs_qlength - sizeof(int)) + len = cstate->cs_qlength - sizeof(int); + + /* read data */ + QUEUE_READ(cstate, len, datarow->msg + offset); + + /* remember how many we read already */ + offset += len; + + /* check if we are done */ + if (offset == datarow->msglen) + return; + + /* need more, set up queue to accept data from the producer */ + Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */ + cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */ + /* Inform producer how many bytes we have already */ + memcpy(cstate->cs_qstart, &offset, sizeof(int)); + /* Release locks and wait until producer supply more data */ + while (cstate->cs_ntuples == LONG_TUPLE) + { + /* prepare wait */ + ResetLatch(&sync->cs_latch); + LWLockRelease(sync->cs_lwlock); + /* Wait for notification about available info */ + WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1); + /* got the notification, restore lock and try again */ + LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE); + } + /* Read length of remaining data */ + QUEUE_READ(cstate, sizeof(int), (char *) &len); + + /* Make sure we are doing the same tuple */ + Assert(offset + len == datarow->msglen); + + /* next iteration */ + } +} diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1cfac9e80b..6086692f81 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -50,6 +50,11 @@ * there is a window (caused by pgstat delay) on which a worker may choose a * table that was already vacuumed; this is a bug in the current design. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -79,6 +84,10 @@ #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" +#ifdef XCP +#include "pgxc/pgxc.h" +#include "pgxc/pgxcnode.h" +#endif #include "postmaster/autovacuum.h" #include "postmaster/fork_process.h" #include "postmaster/postmaster.h" @@ -2150,6 +2159,16 @@ do_autovacuum(void) heap_endscan(relScan); heap_close(classRel, AccessShareLock); +#ifdef XCP + /* + * Coordinator needs to access Datanodes to process distributed table. + */ + if (IS_PGXC_COORDINATOR) + { + InitMultinodeExecutor(false); + } +#endif + /* * Create a buffer access strategy object for VACUUM to use. We want to * use the same one across all the vacuum operations we perform, since the diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 73d5b2e39c..a085f27d9f 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -11,6 +11,11 @@ * - Add a pgstat config column to pg_database, so this * entire thing can be enabled/disabled on a per db basis. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Copyright (c) 2001-2012, PostgreSQL Global Development Group * * src/backend/postmaster/pgstat.c @@ -1804,6 +1809,72 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta) } +#ifdef XCP +/* + * pgstat_count_remote_insert - count insertion of n tuples on remote Datanodes + */ +void +pgstat_count_remote_insert(Relation rel, int n) +{ + /* Should be only applied to distributed table */ + Assert(rel->rd_locator_info); + + /* For now use the same counters as for heap insert */ + pgstat_count_heap_insert(rel, n); +} + + +/* + * pgstat_count_remote_update - count update of n tuples on remote Datanodes + */ +void +pgstat_count_remote_update(Relation rel, int n) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + /* Should be only applied to distributed table */ + Assert(rel->rd_locator_info); + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_info->trans->tuples_updated += n; + } +} + + +/* + * pgstat_count_remote_delete - count delete of n tuples on remote Datanodes + */ +void +pgstat_count_remote_delete(Relation rel, int n) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + /* Should be only applied to distributed table */ + Assert(rel->rd_locator_info); + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_info->trans->tuples_deleted += n; + } +} +#endif + + /* ---------- * AtEOXact_PgStat * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 9069a59fce..4d6972ebaf 100644 --- a/src/backend/postmaster/postmaster.c 
+++ b/src/backend/postmaster/postmaster.c @@ -32,6 +32,11 @@ * clients. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -340,7 +345,11 @@ static DNSServiceRef bonjour_sdref = NULL; #ifdef PGXC char *PGXCNodeName = NULL; +#ifdef XCP +int PGXCNodeId = 0; +#else int PGXCNodeId = -1; +#endif /* * When a particular node starts up, store the node identifier in this variable * so that we dont have to calculate it OR do a search in cache any where else @@ -348,8 +357,10 @@ int PGXCNodeId = -1; */ uint32 PGXCNodeIdentifier = 0; +#ifndef XCP static bool isNodeRegistered = false; #endif +#endif /* * postmaster.c - function prototypes @@ -495,11 +506,35 @@ static void ShmemBackendArrayAdd(Backend *bn); static void ShmemBackendArrayRemove(Backend *bn); #endif /* EXEC_BACKEND */ +#ifdef XCP +char *parentPGXCNode = NULL; +#endif + #ifdef PGXC bool isPGXCCoordinator = false; bool isPGXCDataNode = false; + +/* + * While adding a new node to the cluster we need to restore the schema of + * an existing database to the new node. + * If the new node is a datanode and we connect directly to it, + * it does not allow DDL, because it is in read only mode & + * If the new node is a coordinator it will send DDLs to all the other + * coordinators which we do not want it to do + * To provide ability to restore on the new node a new command line + * argument is provided called --restoremode + * It is to be provided in place of --coordinator OR --datanode. + * In restore mode both coordinator and datanode are internally + * treated as a datanode. 
+ */ +bool isRestoreMode = false; + int remoteConnType = REMOTE_CONN_APP; +/* key pair to be used as object id while using advisory lock for backup */ +Datum xc_lockForBackupKey1; +Datum xc_lockForBackupKey2; + #define StartPoolManager() StartChildProcess(PoolerProcess) #endif @@ -740,6 +775,15 @@ PostmasterMain(int argc, char *argv[]) else if (strcmp(name, "datanode") == 0 && !value) isPGXCDataNode = true; + else if (strcmp(name, "restoremode") == 0 && !value) + { + /* + * In restore mode both coordinator and datanode + * are internally treeated as datanodes + */ + isRestoreMode = true; + isPGXCDataNode = true; + } else /* default case */ { #endif @@ -777,7 +821,11 @@ PostmasterMain(int argc, char *argv[]) #ifdef PGXC if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE) { - write_stderr("%s: Postgres-XC: must start as either a Coordinator (--coordinator) or Datanode (--datanode)\n", +#ifdef XCP + write_stderr("%s: Postgres-XL: must start as either a Coordinator (--coordinator) or Data Node (--datanode)\n", +#else + write_stderr("%s: Postgres-XC: must start as either a Coordinator (--coordinator) or Data Node (--datanode)\n", +#endif progname); ExitPostmaster(1); } @@ -1181,6 +1229,16 @@ PostmasterMain(int argc, char *argv[]) pmState = PM_STARTUP; #ifdef PGXC /* PGXC_COORD */ +#ifdef XCP + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Initialize the Data Node connection pool + */ + PgPoolerPID = StartPoolManager(); + + MemoryContextSwitchTo(oldcontext); +#else if (IS_PGXC_COORDINATOR) { oldcontext = MemoryContextSwitchTo(TopMemoryContext); @@ -1192,7 +1250,8 @@ PostmasterMain(int argc, char *argv[]) MemoryContextSwitchTo(oldcontext); } -#endif +#endif /* XCP */ +#endif /* PGXC */ status = ServerLoop(); @@ -1496,11 +1555,15 @@ ServerLoop(void) if (PgStatPID == 0 && pmState == PM_RUN) PgStatPID = pgstat_start(); -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC /* If we have lost the pooler, try to start a new one */ +#ifdef XCP + if (PgPoolerPID == 0 && 
pmState == PM_RUN) +#else if (IS_PGXC_COORDINATOR && PgPoolerPID == 0 && pmState == PM_RUN) +#endif /* XCP */ PgPoolerPID = StartPoolManager(); -#endif +#endif /* PGXC */ /* If we need to signal the autovacuum launcher, do so now */ if (avlauncher_needs_signal) { @@ -2147,9 +2210,13 @@ SIGHUP_handler(SIGNAL_ARGS) if (StartupPID != 0) signal_child(StartupPID, SIGHUP); #ifdef PGXC /* PGXC_COORD */ +#ifdef XCP + if (PgPoolerPID != 0) +#else if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) +#endif /* XCP */ signal_child(PgPoolerPID, SIGHUP); -#endif +#endif /* PGXC */ if (BgWriterPID != 0) signal_child(BgWriterPID, SIGHUP); if (CheckpointerPID != 0) @@ -2232,9 +2299,14 @@ pmdie(SIGNAL_ARGS) #ifdef PGXC /* PGXC_COORD */ /* and the pool manager too */ +#ifdef XCP + if (PgPoolerPID != 0) +#else if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) +#endif signal_child(PgPoolerPID, SIGTERM); +#ifndef XCP /* Unregister Node on GTM */ if (isNodeRegistered) { @@ -2244,6 +2316,7 @@ pmdie(SIGNAL_ARGS) UnregisterGTM(GTM_NODE_DATANODE); } #endif +#endif /* * If we're in recovery, we can't kill the startup process @@ -2286,6 +2359,11 @@ pmdie(SIGNAL_ARGS) signal_child(BgWriterPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); +#ifdef XCP + /* and the pool manager too */ + if (PgPoolerPID != 0) + signal_child(PgPoolerPID, SIGTERM); +#endif /* XCP */ if (pmState == PM_RECOVERY) { /* @@ -2312,7 +2390,8 @@ pmdie(SIGNAL_ARGS) /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC +#ifndef XCP /* and the pool manager too */ if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) signal_child(PgPoolerPID, SIGTERM); @@ -2325,7 +2404,8 @@ pmdie(SIGNAL_ARGS) else if (IS_PGXC_DATANODE) UnregisterGTM(GTM_NODE_DATANODE); } -#endif +#endif /* XCP */ +#endif /* PGXC */ pmState = PM_WAIT_BACKENDS; } @@ -2350,7 +2430,11 @@ pmdie(SIGNAL_ARGS) if (StartupPID != 0) signal_child(StartupPID, SIGQUIT); #ifdef PGXC /* 
PGXC_COORD */ +#ifdef XCP + if (PgPoolerPID != 0) +#else if (IS_PGXC_COORDINATOR && PgPoolerPID != 0) +#endif /* XCP */ signal_child(PgPoolerPID, SIGQUIT); #endif @@ -2515,10 +2599,14 @@ reaper(SIGNAL_ARGS) PgArchPID = pgarch_start(); if (PgStatPID == 0) PgStatPID = pgstat_start(); -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC +#ifdef XCP + if (PgPoolerPID == 0) +#else if (IS_PGXC_COORDINATOR && PgPoolerPID == 0) +#endif /* XCP */ PgPoolerPID = StartPoolManager(); -#endif +#endif /* PGXC */ /* at this point we are really open for business */ ereport(LOG, @@ -2691,7 +2779,11 @@ reaper(SIGNAL_ARGS) * Was it the pool manager? TODO decide how to handle * Probably we should restart the system */ +#ifdef XCP + if (pid == PgPoolerPID) +#else if (IS_PGXC_COORDINATOR && pid == PgPoolerPID) +#endif /* XCP */ { PgPoolerPID = 0; if (!EXIT_STATUS_0(exitstatus)) @@ -2932,8 +3024,20 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT)); } -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC /* Take care of the pool manager too */ +#ifdef XCP + if (pid == PgPoolerPID) + PgPoolerPID = 0; + else if (PgPoolerPID != 0 && !FatalError) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) PgPoolerPID))); + signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT)); + } +#else if (IS_PGXC_COORDINATOR) { if (pid == PgPoolerPID) @@ -2947,7 +3051,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT)); } } -#endif +#endif /* XCP */ +#endif /* PGXC */ /* * Force a power-cycle of the pgarch process too. 
(This isn't absolutely @@ -3120,7 +3225,7 @@ PostmasterStateMachine(void) */ if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 && StartupPID == 0 && -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC PgPoolerPID == 0 && #endif WalReceiverPID == 0 && @@ -3218,7 +3323,7 @@ PostmasterStateMachine(void) PgArchPID == 0 && PgStatPID == 0) { /* These other guys should be dead already */ -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC Assert(PgPoolerPID == 0); #endif Assert(StartupPID == 0); @@ -4430,6 +4535,7 @@ sigusr1_handler(SIGNAL_ARGS) } #ifdef PGXC +#ifndef XCP /* * Register node to GTM. * A node can only be registered if it has reached a stable recovery state @@ -4475,6 +4581,7 @@ sigusr1_handler(SIGNAL_ARGS) } } #endif +#endif if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) && PgArchPID != 0) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 4c64d4d0b1..a3d525cdad 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -32,6 +32,7 @@ #include "pgxc/locator.h" #include "pgxc/nodemgr.h" #include "pgxc/pgxc.h" +#include "pgxc/postgresql_fdw.h" #include "nodes/nodes.h" #include "optimizer/planner.h" #include "optimizer/var.h" @@ -1330,7 +1331,7 @@ rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte, */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && - !IsLocatorReplicated(GetLocatorType(RelationGetRelid(target_relation)))) + !IsLocatorReplicated(GetRelationLocType(RelationGetRelid(target_relation)))) { var = makeVar(parsetree->resultRelation, XC_NodeIdAttributeNumber, @@ -2692,7 +2693,7 @@ QueryRewriteCTAS(Query *parsetree) cparsetree->utilityStmt = (Node *) create_stmt; initStringInfo(&cquery); - deparse_query(cparsetree, &cquery, NIL, false, false); + deparse_query(cparsetree, &cquery, NIL); /* Finally, fire off the query to run the DDL */ ProcessUtility(cparsetree->utilityStmt, cquery.data, NULL, true, NULL, @@ -2707,7 +2708,7 @@ QueryRewriteCTAS(Query 
*parsetree) /* Get the SELECT query string */ initStringInfo(&cquery); - deparse_query((Query *)stmt->query, &cquery, NIL, true, false); + deparse_query((Query *)stmt->query, &cquery, NIL); selectstr = pstrdup(cquery.data); /* Now, finally build the INSERT INTO statement */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 78145472e1..6858f1ee80 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3,6 +3,11 @@ * bufmgr.c * buffer manager interface routines * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -2048,8 +2053,12 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, { int i; +#ifdef XCP + if (!OidIsValid(MyCoordId) && rnode.backend != InvalidBackendId) +#else /* If it's a local relation, it's localbuf.c's problem. */ if (rnode.backend != InvalidBackendId) +#endif { if (rnode.backend == MyBackendId) DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3c89dcad98..78219b8f53 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -3,6 +3,11 @@ * ipci.c * POSTGRES inter-process communication initialization code. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -40,7 +45,11 @@ #include "storage/procsignal.h" #include "storage/sinvaladt.h" #include "storage/spin.h" - +#ifdef XCP +#include "pgxc/pgxc.h" +#include "pgxc/squeue.h" +#include "pgxc/pause.h" +#endif shmem_startup_hook_type shmem_startup_hook = NULL; @@ -126,6 +135,12 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, AutoVacuumShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); +#ifdef XCP + if (IS_PGXC_DATANODE) + size = add_size(size, SharedQueueShmemSize()); + if (IS_PGXC_COORDINATOR) + size = add_size(size, ClusterLockShmemSize()); +#endif size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); @@ -236,6 +251,16 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) WalSndShmemInit(); WalRcvShmemInit(); +#ifdef XCP + /* + * Set up distributed executor's shared queues + */ + if (IS_PGXC_DATANODE) + SharedQueuesInit(); + if (IS_PGXC_COORDINATOR) + ClusterLockShmemInit(); +#endif + /* * Set up other modules that need some shared memory space */ diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e7f7e6b3ca..34ac658a00 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -18,23 +18,6 @@ * at need by checking for pid == 0. * #ifdef PGXC - * Vanilla PostgreSQL assumes maximum TransactinIds in any snapshot is - * arrayP->maxProcs. It does not apply to XC because XC's snapshot - * should include XIDs running in other node, which may come at any - * time. This means that needed size of xip varies from time to time. - * - * This must be handled properly in all the functions in this module. 
- * - * The member max_xcnt was added as SnapshotData member to indicate the - * real size of xip array. - * - * Here, the following assumption is made for SnapshotData struct throughout - * this module. - * - * 1. xip member physical size is indicated by max_xcnt member. - * 2. If max_xcnt == 0, it means that xip members is NULL, and vise versa. - * 3. xip (and subxip) are allocated usign malloc() or realloc() directly. - * * For Postgres-XC, there is some special handling for ANALYZE. * An XID for a local ANALYZE command will never involve other nodes. * Also, ANALYZE may run for a long time, affecting snapshot xmin values @@ -58,6 +41,11 @@ * happen, it would tie up KnownAssignedXids indefinitely, so we protect * ourselves by pruning the array when a valid list of running XIDs arrives. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -86,7 +74,6 @@ #include "pgxc/pgxc.h" #include "access/gtm.h" #include "storage/ipc.h" -#include "pgxc/nodemgr.h" /* PGXC_DATANODE */ #include "postmaster/autovacuum.h" #endif @@ -121,9 +108,6 @@ typedef struct ProcArrayStruct * but actually it is maxProcs entries long. 
*/ int pgprocnos[1]; /* VARIABLE LENGTH ARRAY */ -#ifdef PGXC - int pgAVproxnos[1]; /* VARIABLE LENGTH ARRAY */ -#endif } ProcArrayStruct; static ProcArrayStruct *procArray; @@ -196,10 +180,6 @@ void UnsetGlobalSnapshotData(void); static bool GetPGXCSnapshotData(Snapshot snapshot); static bool GetSnapshotDataDataNode(Snapshot snapshot); static bool GetSnapshotDataCoordinator(Snapshot snapshot); -static bool resizeXip(Snapshot snapshot, int newsize); -static bool resizeSubxip(Snapshot snapshot, int newsize); -static void cleanSnapshot(Snapshot snapshot); - /* Global snapshot data */ static SnapshotSource snapshot_source = SNAPSHOT_UNDEFINED; static int gxmin = InvalidTransactionId; @@ -253,13 +233,8 @@ ProcArrayShmemSize(void) * standby in the current run, but we don't know that yet at the time * shared memory is being set up. */ -#if 0 /* Reamins this code for the test to disable KnownAssignedXids in the slave */ -#define TOTAL_MAX_CACHED_SUBXIDS \ - (((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) * (MaxCoords + MaxDataNodes)) -#else #define TOTAL_MAX_CACHED_SUBXIDS \ ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) -#endif if (EnableHotStandby) { @@ -1345,8 +1320,6 @@ GetSnapshotData(Snapshot snapshot) int subcount = 0; bool suboverflowed = false; - Assert(snapshot != NULL); - #ifdef PGXC /* PGXC_DATANODE */ /* * Obtain a global snapshot for a Postgres-XC session @@ -1354,18 +1327,22 @@ GetSnapshotData(Snapshot snapshot) */ if (GetPGXCSnapshotData(snapshot)) return snapshot; +#ifdef XCP /* - * The codes below run when GetPGXCSnapshotData() couldn't get snapshot from - * GTM. So no data in snapshot will be used. 
+ * Making falling back stricter */ - cleanSnapshot(snapshot); + if (!snapshot && !RecoveryInProgress() && IsPostmasterEnvironment && + IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess()) + elog(ERROR, "Was unable to obtain a snapshot from GTM."); +#else +#endif #endif /* * Fallback to standard routine, calculate snapshot from local proc arrey * if no master connection */ - + Assert(snapshot != NULL); /* * Allocating space for maxProcs xids is usually overkill; numProcs would @@ -1380,10 +1357,6 @@ GetSnapshotData(Snapshot snapshot) */ if (snapshot->xip == NULL) { -#ifdef PGXC - resizeXip(snapshot, arrayP->maxProcs); - resizeSubxip(snapshot, TOTAL_MAX_CACHED_SUBXIDS); -#else /* * First call for this snapshot. Snapshot is same size whether or not * we are in recovery, see later comments. @@ -1401,7 +1374,6 @@ GetSnapshotData(Snapshot snapshot) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); -#endif } /* @@ -1478,9 +1450,6 @@ GetSnapshotData(Snapshot snapshot) continue; /* Add XID to snapshot. */ -#ifdef PGXC - resizeXip(snapshot, count + 1); -#endif snapshot->xip[count++] = xid; /* @@ -2696,12 +2665,12 @@ DisplayXidCache(void) void SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip) { + if (gxip) + free(gxip); snapshot_source = SNAPSHOT_COORDINATOR; gxmin = xmin; gxmax = xmax; gxcnt = xcnt; - if (gxip) - free(gxip); gxip = xip; elog (DEBUG1, "global snapshot info: gxmin: %d, gxmax: %d, gxcnt: %d", gxmin, gxmax, gxcnt); } @@ -2712,12 +2681,12 @@ SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip) void UnsetGlobalSnapshotData(void) { + if (gxip) + free(gxip); snapshot_source = SNAPSHOT_UNDEFINED; gxmin = InvalidTransactionId; gxmax = InvalidTransactionId; gxcnt = 0; - if (gxip) - free(gxip); gxip = NULL; elog (DEBUG1, "unset snapshot info"); } @@ -2745,14 +2714,45 @@ GetPGXCSnapshotData(Snapshot snapshot) * GTM not to include this transaction ID in snapshot. * A vacuum worker starts as a normal transaction would. 
*/ +#ifdef XCP + /* If we got the transaction id from GTM, we should get the snapshot + * from there, too + */ + if (IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsXidFromGTM) +#else if (IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess()) +#endif { if (GetSnapshotDataDataNode(snapshot)) return true; /* else fallthrough */ + else +#ifdef XCP + { + if (IsAutoVacuumLauncherProcess() || !IsNormalProcessingMode() || !IsPostmasterEnvironment) + { +#endif + elog(LOG, "Will fall back to local snapshot for XID = %d, source = %d, gxmin = %d, autovac launch = %d, autovac = %d, normProcMode = %d, postEnv = %d", + GetCurrentTransactionId(), snapshot_source, gxmin, + IsAutoVacuumLauncherProcess(), IsAutoVacuumWorkerProcess(), + IsNormalProcessingMode(), IsPostmasterEnvironment); +#ifdef XCP + } + else + { + elog(ERROR, "GTM error, no fallback, could not obtain snapshot. Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess()); + } + } +#endif } else if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && IsNormalProcessingMode()) { +#ifdef XCP + /* + * GetSnapshotDataCoordinator will always fail if there is a GTM error. 
+ * There is no need for special checking + */ +#endif /* Snapshot has ever been received from remote Coordinator */ if (GetSnapshotDataCoordinator(snapshot)) return true; @@ -2770,13 +2770,17 @@ GetPGXCSnapshotData(Snapshot snapshot) * IsNormalProcessingMode() - checks for new connections * IsAutoVacuumLauncherProcess - checks for autovacuum launcher process */ - if (IS_PGXC_DATANODE && + if (IS_PGXC_DATANODE && !isRestoreMode && snapshot_source == SNAPSHOT_UNDEFINED && IsPostmasterEnvironment && IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess()) { +#ifdef XCP + elog(ERROR, "Do not have a GTM snapshot available"); +#else elog(WARNING, "Do not have a GTM snapshot available"); +#endif } return false; @@ -2791,7 +2795,11 @@ GetPGXCSnapshotData(Snapshot snapshot) static bool GetSnapshotDataDataNode(Snapshot snapshot) { +#ifdef XCP + Assert(IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsXidFromGTM); +#else Assert(IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess()); +#endif /* * Fallback to general case if Datanode is accessed directly by an application @@ -2800,35 +2808,42 @@ GetSnapshotDataDataNode(Snapshot snapshot) return GetSnapshotDataCoordinator(snapshot); /* Have a look at cases where Datanode is accessed by cluster internally */ +#ifdef XCP + if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM() || IsAutoVacuumLauncherProcess() || IsXidFromGTM) +#else if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) +#endif { GTM_Snapshot gtm_snapshot; bool canbe_grouped = (!FirstSnapshotSet) || (!IsolationUsesXactSnapshot()); elog(DEBUG1, "Getting snapshot for autovacuum. Current XID = %d", GetCurrentTransactionId()); gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionId(), canbe_grouped); - if (!gtm_snapshot) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("GTM error, could not obtain snapshot"))); +#ifdef XCP + errmsg("GTM error, could not obtain snapshot. 
Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess()))); +#else + errmsg("GTM error, could not obtain snapshot."); +#endif else { + if (gxip) + free(gxip); snapshot_source = SNAPSHOT_DIRECT; gxmin = gtm_snapshot->sn_xmin; gxmax = gtm_snapshot->sn_xmax; gxcnt = gtm_snapshot->sn_xcnt; RecentGlobalXmin = gtm_snapshot->sn_recent_global_xmin; - if (gxip) - free(gxip); if (gxcnt > 0) { - gxip = malloc(gxcnt * 4); + gxip = malloc(gxcnt * sizeof(int)); if (gxip == NULL) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } - memcpy(gxip, gtm_snapshot->sn_xip, gxcnt * 4); + memcpy(gxip, gtm_snapshot->sn_xip, gxcnt * sizeof(int)); } else gxip = NULL; @@ -2856,8 +2871,38 @@ GetSnapshotDataDataNode(Snapshot snapshot) * maxProcs does not change at runtime, we can simply reuse the previous * xip arrays if any. (This relies on the fact that all callers pass * static SnapshotData structs.) */ - resizeXip(snapshot, Max(arrayP->maxProcs, gxcnt)); - resizeSubxip(snapshot, PGPROC_MAX_CACHED_SUBXIDS); + if (snapshot->xip == NULL) + { + ProcArrayStruct *arrayP = procArray; + /* + * First call for this snapshot + */ + snapshot->xip = (TransactionId *) + malloc(Max(arrayP->maxProcs, gxcnt) * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = Max(arrayP->maxProcs, gxcnt); + + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + else if (snapshot->max_xcnt < gxcnt) + { + snapshot->xip = (TransactionId *) + realloc(snapshot->xip, gxcnt * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = gxcnt; + } 
memcpy(snapshot->xip, gxip, gxcnt * sizeof(TransactionId)); snapshot->curcid = GetCurrentCommandId(false); @@ -2930,7 +2975,17 @@ GetSnapshotDataDataNode(Snapshot snapshot) continue; if (proc != MyProc) { - resizeXip(snapshot, snapshot->xcnt+1); + if (snapshot->xcnt >= snapshot->max_xcnt) + { + snapshot->max_xcnt += arrayP->numProcs; + + snapshot->xip = (TransactionId *) + realloc(snapshot->xip, snapshot->max_xcnt * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } snapshot->xip[snapshot->xcnt++] = xid; elog(DEBUG1, "Adding Analyze for xid %d to snapshot", pgxact->xid); } @@ -2978,7 +3033,8 @@ GetSnapshotDataCoordinator(Snapshot snapshot) if (!gtm_snapshot) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("GTM error, could not obtain snapshot"))); + errmsg("GTM error, could not obtain snapshot XID = %d", + GetCurrentTransactionId()))); else { snapshot->xmin = gtm_snapshot->sn_xmin; @@ -2998,10 +3054,44 @@ GetSnapshotDataCoordinator(Snapshot snapshot) * xip arrays if any. (This relies on the fact that all callers pass * static SnapshotData structs.) 
*/ + if (snapshot->xip == NULL) { ProcArrayStruct *arrayP = procArray; - resizeXip(snapshot, Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt)); - resizeSubxip(snapshot, PGPROC_MAX_CACHED_SUBXIDS); + /* + * First call for this snapshot + */ + snapshot->xip = (TransactionId *) + malloc(Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt) * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt); + + /* + * FIXME + * + * We really don't support subtransaction in PGXC right now, but + * when we would, we should fix the allocation below + */ + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + else if (snapshot->max_xcnt < gtm_snapshot->sn_xcnt) + { + snapshot->xip = (TransactionId *) + realloc(snapshot->xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + snapshot->max_xcnt = gtm_snapshot->sn_xcnt; } memcpy(snapshot->xip, gtm_snapshot->sn_xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId)); @@ -3031,83 +3121,6 @@ GetSnapshotDataCoordinator(Snapshot snapshot) } return false; } - -/* - * Handlers for xip and subxip member array size, only for XC. 
- * - * Assumes xip is NULL when max_xcnt == 0 - */ -static bool -resizeXip(Snapshot snapshot, int newsize) -{ -#define xipResizeUnit (64) - newsize = ((newsize + xipResizeUnit - 1)/xipResizeUnit)*xipResizeUnit; - - if (snapshot->max_xcnt == 0) - { - snapshot->xip = malloc(newsize * sizeof(TransactionId)); - if (snapshot->xip == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - return false; - } - snapshot->max_xcnt = newsize; - snapshot->xcnt = 0; - return true; - } - else if (snapshot->max_xcnt >= newsize) - return true; - else - { - snapshot->xip = realloc(snapshot->xip, newsize * sizeof(TransactionId)); - if (snapshot->xip == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - return false; - } - snapshot->max_xcnt = newsize; - return true; - } - return false; -} - -/* - * Because XC does not support subtransaction so far, this function allocates - * subxip array with the fixes size of TOTAL_MAX_CACHED_SUBXIDS. This is - * controlled by resizeXip() above. - * If subxip member is not NULL, it assumes subxip array has TOTAL_MAX_CACHED_SUBXIDS - * size, regardless what size is specified. - * This part needs improvement when XC supports subtransaction. 
- */ -static bool -resizeSubxip(Snapshot snapshot, int newsize) -{ - if (snapshot->subxip) - return true; - snapshot->subxip = (TransactionId *) - malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); - if (snapshot->subxip == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - return false; - } - return true; -} - -/* Cleanup the snapshot */ -static void -cleanSnapshot(Snapshot snapshot) -{ - snapshot->xcnt = 0; - snapshot->subxcnt = 0; - snapshot->xmin = snapshot->xmax = InvalidTransactionId; -} #endif /* PGXC */ /* ---------------------------------------------- @@ -3451,25 +3464,6 @@ static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, bool exclusive_lock) { -#ifdef PGXC - /* - * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby - * because hot standby needs to provide consistent database views for all the - * datanode, which is not available yet. - * - * On the other hand, in the slave, current KnownAssignedXids ignores latter half - * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found - * at the first half of the wal record. Some of them can be missing and such missing - * Xids remain in the buffer, causing overflow and the slave stops. - * - * It will need various change in the code, while the hot standby does not work correctly. - * - * For short term solution for Version 1.0.x, it was determined to disable whole hot - * hot staydby. - * - * Hot standby correction will be done in next major release. 
- */ -#else /* use volatile pointer to prevent code rearrangement */ volatile ProcArrayStruct *pArray = procArray; TransactionId next_xid; @@ -3574,7 +3568,6 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, pArray->headKnownAssignedXids = head; SpinLockRelease(&pArray->known_assigned_xids_lck); } -#endif } /* @@ -3744,25 +3737,6 @@ KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, static void KnownAssignedXidsRemovePreceding(TransactionId removeXid) { -#ifdef PGXC - /* - * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby - * because hot standby needs to provide consistent database views for all the - * datanode, which is not available yet. - * - * On the other hand, in the slave, current KnownAssignedXids ignores latter half - * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found - * at the first half of the wal record. Some of them can be missing and such missing - * Xids remain in the buffer, causing overflow and the slave stops. - * - * It will need various change in the code, while the hot standby does not work correctly. - * - * For short term solution for Version 1.0.x, it was determined to disable whole hot - * hot staydby. - * - * Hot standby correction will be done in next major release. - */ -#else /* use volatile pointer to prevent code rearrangement */ volatile ProcArrayStruct *pArray = procArray; int count = 0; @@ -3828,7 +3802,6 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid) /* Opportunistically compress the array */ KnownAssignedXidsCompress(false); -#endif } /* @@ -3858,26 +3831,6 @@ static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, TransactionId xmax) { -#ifdef PGXC - /* - * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby - * because hot standby needs to provide consistent database views for all the - * datanode, which is not available yet. 
- * - * On the other hand, in the slave, current KnownAssignedXids ignores latter half - * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found - * at the first half of the wal record. Some of them can be missing and such missing - * Xids remain in the buffer, causing overflow and the slave stops. - * - * It will need various change in the code, while the hot standby does not work correctly. - * - * For short term solution for Version 1.0.x, it was determined to disable whole hot - * hot staydby. - * - * Hot standby correction will be done in next major release. - */ - return 0; -#else /* use volatile pointer to prevent code rearrangement */ volatile ProcArrayStruct *pArray = procArray; int count = 0; @@ -3928,7 +3881,6 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, } return count; -#endif } /* @@ -3938,26 +3890,6 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, static TransactionId KnownAssignedXidsGetOldestXmin(void) { -#ifdef PGXC - /* - * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby - * because hot standby needs to provide consistent database views for all the - * datanode, which is not available yet. - * - * On the other hand, in the slave, current KnownAssignedXids ignores latter half - * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found - * at the first half of the wal record. Some of them can be missing and such missing - * Xids remain in the buffer, causing overflow and the slave stops. - * - * It will need various change in the code, while the hot standby does not work correctly. - * - * For short term solution for Version 1.0.x, it was determined to disable whole hot - * hot staydby. - * - * Hot standby correction will be done in next major release. 
- */ - return InvalidTransactionId; -#else /* use volatile pointer to prevent code rearrangement */ volatile ProcArrayStruct *pArray = procArray; int head, @@ -3980,7 +3912,6 @@ KnownAssignedXidsGetOldestXmin(void) } return InvalidTransactionId; -#endif } /* @@ -4028,6 +3959,120 @@ KnownAssignedXidsDisplay(int trace_level) pfree(buf.data); } + +#ifdef XCP +/* + * GetGlobalSessionInfo + * + * Determine the global session id of the specified backend process + * Returns coordinator node_id and pid of the initiating coordinator session. + * If no such backend or global session id is not defined for the backend + * return zero values. + */ +void +GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid) +{ + ProcArrayStruct *arrayP = procArray; + int index; + + *coordId = InvalidOid; + *coordPid = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Scan processes and get from it info about the parent session + */ + for (index = 0; index < arrayP->numProcs; index++) + { + volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + + if (proc->pid == pid) + { + *coordId = proc->coordId; + *coordPid = proc->coordPid; + break; + } + } + + LWLockRelease(ProcArrayLock); +} + + +/* + * GetFirstBackendId + * + * Determine BackendId of the current process. + * The caller must hold the ProcArrayLock and the global session id should + * be defined. 
+ */ +int +GetFirstBackendId(int *numBackends, int *backends) +{ + ProcArrayStruct *arrayP = procArray; + Oid coordId = MyProc->coordId; + int coordPid = MyProc->coordPid; + int bCount = 0; + int bPids[MaxBackends]; + int index; + + Assert(OidIsValid(coordId)); + + /* Scan processes */ + for (index = 0; index < arrayP->numProcs; index++) + { + volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + + /* Skip MyProc */ + if (proc == MyProc) + continue; + + if (proc->coordId == coordId && proc->coordPid == coordPid) + { + /* BackendId is the same for all backends of the session */ + if (proc->firstBackendId != InvalidBackendId) + return proc->firstBackendId; + + bPids[bCount++] = proc->pid; + } + } + + if (*numBackends > 0) + { + int i, j; + /* + * This is not the first invocation, to prevent endless loop in case + * if first backend failed to complete initialization check if all the + * processes which were intially found are still here, throw error if + * not. + */ + for (i = 0; i < *numBackends; i++) + { + bool found = false; + + for (j = 0; j < bCount; j++) + { + if (bPids[j] == backends[i]) + { + found = true; + break; + } + } + + if (!found) + elog(ERROR, "Failed to determine BackendId for distributed session"); + } + } + else + { + *numBackends = bCount; + if (bCount) + memcpy(backends, bPids, bCount * sizeof(int)); + } + return InvalidBackendId; +} +#endif + /* * KnownAssignedXidsReset * Resets KnownAssignedXids to be empty diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 71cbd93efa..9e0ab12c97 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -4,6 +4,11 @@ * Routines for interprocess signalling * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 1ac46b87f9..61220275a3 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3,6 +3,11 @@ * lock.c * POSTGRES primary lock mechanism * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -1233,6 +1238,79 @@ LockCheckConflicts(LockMethod lockMethodTable, return STATUS_OK; } + +#ifdef XCP + /* + * So the lock is conflicting with locks held by some other backend. + * But the backend may belong to the same distributed session. We need to + * detect such cases and either allow the lock or throw error, because + * waiting for the lock most probably would cause deadlock. 
+ */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + if (proc->coordPid > 0) + { + /* Count locks held by this process and friends */ + int myHolding[numLockModes + 1]; + SHM_QUEUE *procLocks; + PROCLOCK *nextplock; + + /* Initialize the counters */ + for (i = 1; i <= numLockModes; i++) + myHolding[i] = 0; + otherLocks = 0; + + /* Iterate over processes associated with the lock */ + procLocks = &(lock->procLocks); + + nextplock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); + while (nextplock) + { + PGPROC *nextproc = nextplock->tag.myProc; + + if (nextproc->coordPid == proc->coordPid && + nextproc->coordId == proc->coordId) + { + /* + * The process belongs to same distributed session, count locks + */ + myLocks = nextplock->holdMask; + for (i = 1; i <= numLockModes; i++) + myHolding[i] += ((myLocks & LOCKBIT_ON(i)) ? 1 : 0); + } + /* get next proclock */ + nextplock = (PROCLOCK *) + SHMQueueNext(procLocks, &nextplock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + /* Summarize locks held by other processes */ + for (i = 1; i <= numLockModes; i++) + { + if (lock->granted[i] > myHolding[i]) + otherLocks |= LOCKBIT_ON(i); + } + + /* + * Yet another check. + */ + if (!(lockMethodTable->conflictTab[lockmode] & otherLocks)) + { + LWLockRelease(ProcArrayLock); + /* no conflict. 
OK to get the lock */ + PROCLOCK_PRINT("LockCheckConflicts: resolved as held by friend", + proclock); +#ifdef LOCK_DEBUG + elog(LOG, "Allow lock as held by the same distributed session [%u,%u] %s", + lock->tag.locktag_field1, lock->tag.locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + return STATUS_OK; + } + } + LWLockRelease(ProcArrayLock); +#endif + PROCLOCK_PRINT("LockCheckConflicts: conflicting", proclock); return STATUS_FOUND; } diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 95d4b37bef..8da345ea0d 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -11,6 +11,11 @@ * LWLocks to protect its shared state. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -31,7 +36,10 @@ #include "storage/predicate.h" #include "storage/proc.h" #include "storage/spin.h" - +#ifdef XCP +#include "pgxc/nodemgr.h" +#include "pgxc/squeue.h" +#endif /* We use the ShmemLock spinlock to protect LWLockAssign */ extern slock_t *ShmemLock; @@ -201,6 +209,12 @@ NumLWLocks(void) /* predicate.c needs one per old serializable xid buffer */ numLocks += NUM_OLDSERXID_BUFFERS; +#ifdef XCP + /* squeue.c needs one per consumer node in each shared queue. 
+ * Max number of consumers is MaxDataNodes-1 */ + numLocks += NUM_SQUEUES * (MaxDataNodes-1); +#endif + /* * Add any requested by loadable modules; for backwards-compatibility * reasons, allocate at least NUM_USER_DEFINED_LWLOCKS of them even if @@ -739,6 +753,7 @@ LWLockRelease(LWLockId lockid) } if (i < 0) elog(ERROR, "lock %d is not held", (int) lockid); + num_held_lwlocks--; for (; i < num_held_lwlocks; i++) held_lwlocks[i] = held_lwlocks[i + 1]; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 16fe9dfb0f..66c021f0a2 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -3,6 +3,11 @@ * proc.c * routines to manage per-process shared memory data structure * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -357,6 +362,10 @@ InitProcess(void) MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; +#ifdef XCP + MyProc->coordId = InvalidOid; + MyProc->coordPid = 0; +#endif MyPgXact->inCommit = false; MyPgXact->vacuumFlags = 0; #ifdef PGXC @@ -518,6 +527,10 @@ InitAuxiliaryProcess(void) MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; +#ifdef XCP + MyProc->coordId = InvalidOid; + MyProc->coordPid = 0; +#endif #ifdef PGXC MyProc->isPooler = false; if (IsPGXCPoolerProcess()) diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c index c6ab54aa3b..e7fc308e7d 100644 --- a/src/backend/tcop/dest.c +++ b/src/backend/tcop/dest.c @@ -4,6 +4,11 @@ * support for communication destinations * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -33,6 +38,9 @@ #include "commands/copy.h" #include "commands/createas.h" #include "executor/functions.h" +#ifdef XCP +#include "executor/producerReceiver.h" +#endif #include "executor/tstoreReceiver.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" @@ -125,6 +133,11 @@ CreateDestReceiver(CommandDest dest) case DestSQLFunction: return CreateSQLFunctionDestReceiver(); + +#ifdef XCP + case DestProducer: + return CreateProducerDestReceiver(); +#endif } /* should never get here */ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b99320f529..633b69b8db 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3,6 +3,11 @@ * postgres.c * POSTGRES C Backend Interface * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -89,10 +94,14 @@ /* PGXC_COORD */ #include "pgxc/execRemote.h" #include "pgxc/barrier.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "nodes/nodes.h" #include "pgxc/poolmgr.h" #include "pgxc/pgxcnode.h" +#ifdef XCP +#include "pgxc/pause.h" +#include "pgxc/squeue.h" +#endif #include "commands/copy.h" /* PGXC_DATANODE */ #include "access/transam.h" @@ -374,10 +383,105 @@ SocketBackend(StringInfo inBuf) { int qtype; +#ifdef XCP + /* + * Session from data node may need to do some background work if it is + * running producing subplans. 
So just poll the connection, and if it does + * not have input for us do the work. + * If we do not have producing portals we should use the blocking read + * to avoid loop consuming 100% of CPU + */ + if (IS_PGXC_DATANODE && IsConnFromDatanode()) + { + /* + * Advance producing portals or poll client connection until we have + * a client command to handle. + */ + while (true) + { + unsigned char c; + + qtype = pq_getbyte_if_available(&c); + if (qtype == 0) /* no commands, do producing */ + { + /* + * No command yet, try to advance producing portals, and + * depending on result do: + * -1 No producing portals, block and wait for client command + * 0 All producing portals are paused, sleep for a moment and + * then check again either we have client command or some + * portal is awaken. + * 1 check for client command and more continue advancing + * producers immediately + */ + int activePortals = -1; + ListCell *lc = list_head(getProducingPortals()); + while (lc) + { + Portal p = (Portal) lfirst(lc); + int result; + + /* + * Get next already, because next call may remove cell from + * the list and invalidate next reference + */ + lc = lnext(lc); + + result = AdvanceProducingPortal(p, true); + if (result == 0) + { + /* Portal is paused */ + if (activePortals < 0) + activePortals = 0; + } + else if (result > 0) + { + if (activePortals < 0) + activePortals = result; + else + activePortals += result; + } + } + if (activePortals < 0) + { + /* no producers at all, we may wait while next command */ + qtype = pq_getbyte(); + break; + } + else if (activePortals == 0) + { + /* all producers are paused, sleep a little to allow other + * processes to go */ + pg_usleep(10000L); + } + } + else if (qtype == 1) + { + /* command code in c is defined, move it to qtype + * and break to handle the command */ + qtype = c; + break; + } + else + { + /* error, default handling, qtype is already set to EOF */ + break; + } + } + } + else + { + /* + * Get message type code from the frontend. 
+ */ + qtype = pq_getbyte(); + } +#else /* * Get message type code from the frontend. */ qtype = pq_getbyte(); +#endif if (qtype == EOF) /* frontend disconnected */ { @@ -449,6 +553,9 @@ SocketBackend(StringInfo inBuf) break; case 'B': /* bind */ +#ifdef XCP /* PGXC_DATANODE */ + case 'p': /* plan */ +#endif case 'C': /* close */ case 'D': /* describe */ case 'E': /* execute */ @@ -666,6 +773,7 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string, querytree_list = pg_rewrite_query(query); #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { ListCell *lc; @@ -679,6 +787,7 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string, } } #endif +#endif TRACE_POSTGRESQL_QUERY_REWRITE_DONE(query_string); @@ -719,6 +828,9 @@ pg_analyze_and_rewrite_params(Node *parsetree, if (post_parse_analyze_hook) (*post_parse_analyze_hook) (pstate, query); + if (post_parse_analyze_hook) + (*post_parse_analyze_hook) (pstate, query); + free_parsestate(pstate); if (log_parser_stats) @@ -953,6 +1065,37 @@ exec_simple_query(const char *query_string) */ parsetree_list = pg_parse_query(query_string); +#ifdef XCP + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && + list_length(parsetree_list) > 1) + { + /* + * There is a bug in old code, if one query contains multiple utility + * statements, entire query may be sent multiple times to the Datanodes + * for execution. That is becoming a severe problem, if query contains + * COMMIT or ROLLBACK. After executed for the first time the transaction + * handling statement would write CLOG entry for current xid, but other + * executions would be done with the same xid, causing PANIC on the + * Datanodes because of already existing CLOG record. Datanode is + * restarting all sessions if it PANICs, and affects all cluster users. 
+ * Multiple utility statements may result in strange error messages, + * but somteime they work, and used in many applications, so we do not + * want to disable them completely, just protect against severe + * vulnerability here. + */ + foreach(parsetree_item, parsetree_list) + { + Node *parsetree = (Node *) lfirst(parsetree_item); + + if (IsTransactionExitStmt(parsetree)) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("COMMIT or ROLLBACK " + "in multi-statement queries not allowed"))); + } + } +#endif + /* Log immediately if dictated by log_statement */ if (check_log_statement(parsetree_list)) { @@ -1423,6 +1566,7 @@ exec_parse_message(const char *query_string, /* string to execute */ querytree_list = pg_rewrite_query(query); #ifdef PGXC +#ifndef XCP if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { ListCell *lc; @@ -1436,6 +1580,7 @@ exec_parse_message(const char *query_string, /* string to execute */ } } #endif +#endif /* Done with the snapshot used for parsing */ if (snapshot_set) @@ -1534,6 +1679,143 @@ exec_parse_message(const char *query_string, /* string to execute */ debug_query_string = NULL; } +#ifdef XCP +/* + * exec_plan_message + * + * Execute a "Plan" protocol message - already planned statement. + */ +static void +exec_plan_message(const char *query_string, /* source of the query */ + const char *stmt_name, /* name for prepared stmt */ + const char *plan_string, /* encoded plan to execute */ + char **paramTypeNames, /* parameter type names */ + int numParams) /* number of parameters */ +{ + MemoryContext oldcontext; + bool save_log_statement_stats = log_statement_stats; + char msec_str[32]; + Oid *paramTypes; + CachedPlanSource *psrc; + + /* Statement name should not be empty */ + Assert(stmt_name[0]); + + /* + * Report query to various monitoring facilities. 
+ */ + debug_query_string = query_string; + + pgstat_report_activity(STATE_RUNNING, query_string); + + set_ps_display("PLAN", false); + + if (save_log_statement_stats) + ResetUsage(); + + ereport(DEBUG2, + (errmsg("plan %s: %s", + *stmt_name ? stmt_name : "<unnamed>", + query_string))); + + /* + * Start up a transaction command so we can decode plan etc. (Note + * that this will normally change current memory context.) Nothing happens + * if we are already in one. + */ + start_xact_command(); + + /* + * XXX + * Postgres decides about memory context to use based on "named/unnamed" + * assuming named statement is executed multiple times and unnamed is + * executed once. + * Plan message always provide statement name, but we may use different + * criteria, like if plan is referencing "internal" parameters it probably + * will be executed multiple times, if not - once. + * So far optimize for multiple executions. + */ + /* Named prepared statement --- parse in MessageContext */ + oldcontext = MemoryContextSwitchTo(MessageContext); +// unnamed_stmt_context = +// AllocSetContextCreate(CacheMemoryContext, +// "unnamed prepared statement", +// ALLOCSET_DEFAULT_MINSIZE, +// ALLOCSET_DEFAULT_INITSIZE, +// ALLOCSET_DEFAULT_MAXSIZE); +// oldcontext = MemoryContextSwitchTo(unnamed_stmt_context); + + /* + * Determine parameter types + */ + if (numParams > 0) + { + int cnt_param; + paramTypes = (Oid *) palloc(numParams * sizeof(Oid)); + /* we don't expect type mod */ + for (cnt_param = 0; cnt_param < numParams; cnt_param++) + parseTypeString(paramTypeNames[cnt_param], ¶mTypes[cnt_param], + NULL); + + } + + /* If we got a cancel signal, quit */ + CHECK_FOR_INTERRUPTS(); + + psrc = CreateCachedPlan(NULL, query_string, stmt_name, "REMOTE SUBPLAN"); + + CompleteCachedPlan(psrc, NIL, NULL, paramTypes, numParams, NULL, NULL, + CURSOR_OPT_GENERIC_PLAN, false); + + /* + * Store the query as a prepared statement. See above comments. 
+ */ + StorePreparedStatement(stmt_name, psrc, false); + + SetRemoteSubplan(psrc, plan_string); + + MemoryContextSwitchTo(oldcontext); + + /* + * We do NOT close the open transaction command here; that only happens + * when the client sends Sync. Instead, do CommandCounterIncrement just + * in case something happened during parse/plan. + */ + CommandCounterIncrement(); + + /* + * Send ParseComplete. + */ + if (whereToSendOutput == DestRemote) + pq_putemptymessage('1'); + + /* + * Emit duration logging if appropriate. + */ + switch (check_log_duration(msec_str, false)) + { + case 1: + ereport(LOG, + (errmsg("duration: %s ms", msec_str), + errhidestmt(true))); + break; + case 2: + ereport(LOG, + (errmsg("duration: %s ms parse %s: %s", + msec_str, + *stmt_name ? stmt_name : "<unnamed>", + query_string), + errhidestmt(true))); + break; + } + + if (save_log_statement_stats) + ShowUsage("PLAN MESSAGE STATISTICS"); + + debug_query_string = NULL; +} +#endif + /* * exec_bind_message * @@ -2741,6 +3023,14 @@ die(SIGNAL_ARGS) } } +#ifdef XCP + /* release cluster lock if holding it */ + if (cluster_ex_lock_held) + { + ReleaseClusterLock(true); + } +#endif + /* If we're still here, waken anything waiting on the process latch */ if (MyProc) SetLatch(&MyProc->procLatch); @@ -3628,7 +3918,12 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx) { ereport(FATAL, (errcode(ERRCODE_SYNTAX_ERROR), +#ifdef XCP + errmsg("Postgres-XL: must start as either a Coordinator (--coordinator) or Datanode (-datanode)\n"))); +#else errmsg("Postgres-XC: must start as either a Coordinator (--coordinator) or Datanode (-datanode)\n"))); +#endif + } if (!IsPostmasterEnvironment) { @@ -3705,10 +4000,17 @@ PostgresMain(int argc, char *argv[], const char *username) int *xip; /* Timestamp info */ TimestampTz timestamp; +#ifndef XCP PoolHandle *pool_handle; +#endif remoteConnType = REMOTE_CONN_APP; #endif +#ifdef XCP + parentPGXCNode = NULL; + cluster_lock_held = false; + cluster_ex_lock_held 
= false; +#endif /* XCP */ /* * Initialize globals (already done if under postmaster, but not if @@ -3960,7 +4262,39 @@ PostgresMain(int argc, char *argv[], const char *username) if (!IsUnderPostmaster) PgStartTime = GetCurrentTimestamp(); -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC + /* + * Initialize key pair to be used as object id while using advisory lock + * for backup + */ + xc_lockForBackupKey1 = Int32GetDatum(XC_LOCK_FOR_BACKUP_KEY_1); + xc_lockForBackupKey1 = Int32GetDatum(XC_LOCK_FOR_BACKUP_KEY_2); + +#ifdef XCP + if (IsUnderPostmaster) + { + /* + * Prepare to handle distributed requests. + * Do that after sending down ReadyForQuery, to avoid pooler + * blocking. + */ + start_xact_command(); + InitMultinodeExecutor(false); + finish_xact_command(); + } + + /* Set up the post parse analyze hook */ + post_parse_analyze_hook = ParseAnalyze_callback; + + /* if we exit, try to release cluster lock properly */ + on_shmem_exit(PGXCCleanClusterLock, 0); + + /* if we exit, try to release shared queues */ + on_shmem_exit(SharedQueuesCleanup, 0); + + /* If we exit, first try and clean connections and send to pool */ + on_proc_exit(PGXCNodeCleanAndRelease, 0); +#else /* If this postmaster is launched from another Coord, do not initialize handles. skip it */ if (IS_PGXC_COORDINATOR && !IsPoolHandle()) { @@ -3987,6 +4321,7 @@ PostgresMain(int argc, char *argv[], const char *username) /* If we exit, first try and clean connections and send to pool */ on_proc_exit (PGXCNodeCleanAndRelease, 0); } +#endif /* XCP */ if (IS_PGXC_DATANODE) { /* If we exit, first try and clean connection to GTM */ @@ -4142,6 +4477,15 @@ PostgresMain(int argc, char *argv[], const char *username) } ReadyForQuery(whereToSendOutput); +#ifdef XCP + /* + * Before we read any new command we now should wait while all + * already closed portals which are still producing finish their + * work. 
+ */ + if (IS_PGXC_DATANODE && IsConnFromDatanode()) + cleanupClosedProducers(); +#endif #ifdef PGXC /* * Helps us catch any problems where we did not send down a snapshot @@ -4190,6 +4534,24 @@ PostgresMain(int argc, char *argv[], const char *username) if (ignore_till_sync && firstchar != EOF) continue; +#ifdef XCP + /* + * Acquire the ClusterLock before starting query processing. + * + * If we are inside a transaction block, this lock will be already held + * when the transaction began + * + * If the session has invoked a PAUSE CLUSTER earlier, then this lock + * will be held already in exclusive mode. No need to lock in that case + */ + if (IsUnderPostmaster && IS_PGXC_COORDINATOR && !cluster_ex_lock_held && !cluster_lock_held) + { + bool exclusive = false; + AcquireClusterLock(exclusive); + cluster_lock_held = true; + } +#endif /* XCP */ + switch (firstchar) { case 'Q': /* simple query */ @@ -4247,6 +4609,38 @@ PostgresMain(int argc, char *argv[], const char *username) } break; +#ifdef XCP + case 'p': /* plan */ + { + const char *stmt_name; + const char *query_string; + const char *plan_string; + int numParams; + char **paramTypes = NULL; + + /* Set statement_timestamp() */ + SetCurrentStatementStartTimestamp(); + + stmt_name = pq_getmsgstring(&input_message); + query_string = pq_getmsgstring(&input_message); + plan_string = pq_getmsgstring(&input_message); + numParams = pq_getmsgint(&input_message, 2); + paramTypes = (char **)palloc(numParams * sizeof(char *)); + if (numParams > 0) + { + int i; + for (i = 0; i < numParams; i++) + paramTypes[i] = (char *) + pq_getmsgstring(&input_message); + } + pq_getmsgend(&input_message); + + exec_plan_message(query_string, stmt_name, plan_string, + paramTypes, numParams); + } + break; +#endif + case 'B': /* bind */ /* Set statement_timestamp() */ SetCurrentStatementStartTimestamp(); @@ -4463,7 +4857,7 @@ PostgresMain(int argc, char *argv[], const char *username) if (xcnt > 0) { int i; - xip = malloc(xcnt * 4); + xip = 
malloc(xcnt * sizeof(int)); if (xip == NULL) { ereport(ERROR, @@ -4528,6 +4922,22 @@ PostgresMain(int argc, char *argv[], const char *username) errmsg("invalid frontend message type %d", firstchar))); } + +#ifdef XCP + /* + * If the connection is going idle, release the cluster lock. However + * if the session had invoked a PAUSE CLUSTER earlier, then wait for a + * subsequent UNPAUSE to release this lock + */ + if (IsUnderPostmaster && IS_PGXC_COORDINATOR && !IsAbortedTransactionBlockState() + && !IsTransactionOrTransactionBlock() + && cluster_lock_held && !cluster_ex_lock_held) + { + bool exclusive = false; + ReleaseClusterLock(exclusive); + cluster_lock_held = false; + } +#endif /* XCP */ } /* end of input-reading loop */ /* can't get here because the above loop never exits */ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 3524410025..afc4d0f774 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -3,6 +3,11 @@ * pquery.c * POSTGRES process query command code * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -20,9 +25,14 @@ #include "executor/tstoreReceiver.h" #include "miscadmin.h" #include "pg_trace.h" +#ifdef XCP +#include "catalog/pgxc_node.h" +#include "executor/producerReceiver.h" +#include "pgxc/nodemgr.h" +#endif #ifdef PGXC #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "pgxc/execRemote.h" #include "access/relscan.h" #endif @@ -60,7 +70,6 @@ static long DoPortalRunFetch(Portal portal, DestReceiver *dest); static void DoPortalRewind(Portal portal); - /* * CreateQueryDesc */ @@ -93,6 +102,11 @@ CreateQueryDesc(PlannedStmt *plannedstmt, qd->planstate = NULL; qd->totaltime = NULL; +#ifdef XCP + qd->squeue = NULL; + qd->myindex = -1; +#endif + return qd; } @@ -347,6 +361,11 @@ ChoosePortalStrategy(List *stmts) { PlannedStmt *pstmt = (PlannedStmt *) stmt; +#ifdef XCP + if (list_length(pstmt->distributionRestrict) > 1) + return PORTAL_DISTRIBUTED; +#endif + if (pstmt->canSetTag) { if (pstmt->commandType == CMD_SELECT && @@ -530,7 +549,11 @@ PortalStart(Portal portal, ParamListInfo params, ResourceOwner saveResourceOwner; MemoryContext savePortalContext; MemoryContext oldContext; +#ifdef XCP + QueryDesc *queryDesc = NULL; +#else QueryDesc *queryDesc; +#endif int myeflags; AssertArg(PortalIsValid(portal)); @@ -563,6 +586,201 @@ PortalStart(Portal portal, ParamListInfo params, */ switch (portal->strategy) { +#ifdef XCP + case PORTAL_DISTRIBUTED: + /* No special ability is needed */ + eflags = 0; + /* Must set snapshot before starting executor. */ + if (use_active_snapshot) + PushActiveSnapshot(GetActiveSnapshot()); + else + PushActiveSnapshot(GetTransactionSnapshot()); + + /* + * Create QueryDesc in portal's context; for the moment, set + * the destination to DestNone. 
+ */ + queryDesc = CreateQueryDesc((PlannedStmt *) linitial(portal->stmts), + portal->sourceText, + GetActiveSnapshot(), + InvalidSnapshot, + None_Receiver, + params, + 0); + /* + * If parent node have sent down parameters, and at least one + * of them is PARAM_EXEC we should avoid "single execution" + * model. All parent nodes deliver the same values for + * PARAM_EXTERN since these values are provided by client and + * they are not changed during the query execution. + * On the conrary, values of PARAM_EXEC are results of execution + * on the parent node and in general diferent parents send to + * this node different values and executions are not equivalent. + * Since PARAM_EXECs are always at the end of the list we just + * need to check last item to figure out if there are any + * PARAM_EXECs. + * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect + * here since queryDesc->plannedstmt->nParamExec may be used + * just to allocate space for them and no actual values passed. + */ + if (queryDesc->plannedstmt->nParamRemote > 0 && + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) + { + int *consMap; + int len; + int selfid; /* Node Id of the parent data node */ + char ntype = PGXC_NODE_DATANODE; + ListCell *lc; + int i; + Locator *locator; + Oid keytype; + DestReceiver *dest; + + len = list_length(queryDesc->plannedstmt->distributionNodes); + consMap = (int *) palloc0(len * sizeof(int)); + queryDesc->squeue = NULL; + queryDesc->myindex = -1; + selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, + &ntype); + i = 0; + foreach(lc, queryDesc->plannedstmt->distributionNodes) + { + if (selfid == lfirst_int(lc)) + consMap[i] = SQ_CONS_SELF; + else + consMap[i] = SQ_CONS_NONE; + i++; + } + /* + * Multiple executions of the RemoteSubplan may lead to name + * conflict of SharedQueue, if the subplan has more + * RemoteSubplan nodes in the execution plan tree. + * We need to make them unique. 
+ */ + RemoteSubplanMakeUnique( + (Node *) queryDesc->plannedstmt->planTree, + selfid); + /* + * Call ExecutorStart to prepare the plan for execution + */ + ExecutorStart(queryDesc, eflags); + + /* + * Set up locator if result distribution is requested + */ + keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ? + InvalidOid : + queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid; + locator = createLocator( + queryDesc->plannedstmt->distributionType, + RELATION_ACCESS_INSERT, + keytype, + LOCATOR_LIST_INT, + len, + consMap, + NULL, + false); + dest = CreateDestReceiver(DestProducer); + SetProducerDestReceiverParams(dest, + queryDesc->plannedstmt->distributionKey, + locator, queryDesc->squeue); + queryDesc->dest = dest; + } + else + { + int *consMap; + int len; + + /* Distributed data requested, bind shared queue for data exchange */ + len = list_length(queryDesc->plannedstmt->distributionNodes); + consMap = (int *) palloc(len * sizeof(int)); + queryDesc->squeue = SharedQueueBind(portal->name, + queryDesc->plannedstmt->distributionRestrict, + queryDesc->plannedstmt->distributionNodes, + &queryDesc->myindex, consMap); + if (queryDesc->myindex == -1) + { + /* producer */ + Locator *locator; + Oid keytype; + DestReceiver *dest; + + PG_TRY(); + { + /* + * Call ExecutorStart to prepare the plan for execution + */ + ExecutorStart(queryDesc, eflags); + } + PG_CATCH(); + { + /* Ensure SharedQueue is released */ + SharedQueueUnBind(queryDesc->squeue); + queryDesc->squeue = NULL; + PG_RE_THROW(); + } + PG_END_TRY(); + + /* + * This tells PortalCleanup to shut down the executor + */ + portal->queryDesc = queryDesc; + + /* + * Set up locator if result distribution is requested + */ + keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ? 
+ InvalidOid : + queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid; + locator = createLocator( + queryDesc->plannedstmt->distributionType, + RELATION_ACCESS_INSERT, + keytype, + LOCATOR_LIST_INT, + len, + consMap, + NULL, + false); + dest = CreateDestReceiver(DestProducer); + SetProducerDestReceiverParams(dest, + queryDesc->plannedstmt->distributionKey, + locator, queryDesc->squeue); + queryDesc->dest = dest; + + addProducingPortal(portal); + } + else + { + /* + * We do not need to initialize executor, but need + * a tuple descriptor + */ + queryDesc->tupDesc = ExecCleanTypeFromTL( + queryDesc->plannedstmt->planTree->targetlist, + false); + } + pfree(consMap); + } + + portal->queryDesc = queryDesc; + + /* + * Remember tuple descriptor (computed by ExecutorStart) + */ + portal->tupDesc = queryDesc->tupDesc; + + /* + * Reset cursor position data to "start of query" + */ + portal->atStart = true; + portal->atEnd = false; /* allow fetches */ + portal->portalPos = 0; + portal->posOverflow = false; + + PopActiveSnapshot(); + break; +#endif + case PORTAL_ONE_SELECT: /* Must set snapshot before starting executor. */ @@ -678,6 +896,17 @@ PortalStart(Portal portal, ParamListInfo params, /* Uncaught error while executing portal: mark it dead */ MarkPortalFailed(portal); +#ifdef XCP + if (queryDesc && queryDesc->squeue) + { + /* + * Associate the query desc with the portal so it is unbound upon + * transaction end. 
+ */ + portal->queryDesc = queryDesc; + } +#endif + /* Restore global vars and propagate error */ ActivePortal = saveActivePortal; CurrentResourceOwner = saveResourceOwner; @@ -888,6 +1117,175 @@ PortalRun(Portal portal, long count, bool isTopLevel, result = true; break; +#ifdef XCP + case PORTAL_DISTRIBUTED: + if (count == FETCH_ALL) + count = 0; + nprocessed = 0; + + if (portal->queryDesc->myindex == -1) + { + long oldPos; + + if (portal->queryDesc->squeue) + { + /* Make sure the producer is advancing */ + while (count == 0 || nprocessed < count) + { + if (!portal->queryDesc->estate->es_finished) + AdvanceProducingPortal(portal, false); + /* make read pointer active */ + tuplestore_select_read_pointer(portal->holdStore, 1); + /* perform reads */ + nprocessed += RunFromStore(portal, + ForwardScanDirection, + count ? count - nprocessed : 0, + dest); + /* + * Switch back to the write pointer + * We do not want to seek if the tuplestore operates + * with a file, so copy pointer before. + * Also advancing write pointer would allow to free some + * memory. + */ + tuplestore_copy_read_pointer(portal->holdStore, 1, 0); + tuplestore_select_read_pointer(portal->holdStore, 0); + /* try to release occupied memory */ + tuplestore_trim(portal->holdStore); + /* Break if we can not get more rows */ + if (portal->queryDesc->estate->es_finished) + break; + } + if (nprocessed > 0) + portal->atStart = false; /* OK to go backward now */ + portal->atEnd = portal->queryDesc->estate->es_finished && + tuplestore_ateof(portal->holdStore); + oldPos = portal->portalPos; + portal->portalPos += nprocessed; + /* portalPos doesn't advance when we fall off the end */ + if (portal->portalPos < oldPos) + portal->posOverflow = true; + } + else + { + DestReceiver *olddest; + + Assert(portal->queryDesc->dest->mydest == DestProducer); + olddest = SetSelfConsumerDestReceiver( + portal->queryDesc->dest, dest); + /* + * Now fetch desired portion of results. 
+ */ + nprocessed = PortalRunSelect(portal, true, count, + portal->queryDesc->dest); + SetSelfConsumerDestReceiver( + portal->queryDesc->dest, olddest); + } + } + else + { + QueryDesc *queryDesc = portal->queryDesc; + SharedQueue squeue = queryDesc->squeue; + int myindex = queryDesc->myindex; + TupleTableSlot *slot; + long oldPos; + + /* + * We are the consumer. + * We have skipped plan initialization, hence we do not have + * a tuple table to get a slot to receive tuples, so prepare + * standalone slot. + */ + slot = MakeSingleTupleTableSlot(queryDesc->tupDesc); + + (*dest->rStartup) (dest, CMD_SELECT, queryDesc->tupDesc); + + /* + * Loop until we've processed the proper number of tuples + * from the plan. + */ + for (;;) + { + List *producing = getProducingPortals(); + bool done; + + /* + * Obtain a tuple from the queue. + * If the session is running producing cursors it is + * not safe to wait for available tuple. Two sessions + * may deadlock each other. So if session is producing + * it should keep advancing producing cursors. + */ + done = SharedQueueRead(squeue, myindex, slot, + list_length(producing) == 0); + + /* + * if the tuple is null, then we assume there is nothing + * more to process so we end the loop... + * Also if null tuple is returned the squeue is reset + * already, we want to prevent resetting it again + */ + if (TupIsNull(slot)) + { + if (!done && producing) + { + /* No data to read, advance producing portals */ + ListCell *lc = list_head(producing); + while (lc) + { + Portal p = (Portal) lfirst(lc); + /* Get reference to next entry before + * advancing current portal, because the + * function may remove current entry from + * the list. + */ + lc = lnext(lc); + + AdvanceProducingPortal(p, false); + } + continue; + } + else + { + queryDesc->squeue = NULL; + break; + } + } + /* + * Send the tuple + */ + (*dest->receiveSlot) (slot, dest); + + /* + * increment the number of processed tuples and check count. 
+ * If we've processed the proper number then quit, else + * loop again and process more tuples. Zero count means + * no limit. + */ + if (count && count == ++nprocessed) + break; + } + (*dest->rShutdown) (dest); + + ExecDropSingleTupleTableSlot(slot); + + if (nprocessed > 0) + portal->atStart = false; /* OK to go backward now */ + if (count == 0 || + (unsigned long) nprocessed < (unsigned long) count) + portal->atEnd = true; /* we retrieved 'em all */ + oldPos = portal->portalPos; + portal->portalPos += nprocessed; + /* portalPos doesn't advance when we fall off the end */ + if (portal->portalPos < oldPos) + portal->posOverflow = true; + } + /* Mark portal not active */ + portal->status = PORTAL_READY; + result = portal->atEnd; + break; +#endif + default: elog(ERROR, "unrecognized portal strategy: %d", (int) portal->strategy); @@ -1010,6 +1408,7 @@ PortalRunSelect(Portal portal, PushActiveSnapshot(queryDesc->snapshot); #ifdef PGXC +#ifndef XCP if (portal->name != NULL && portal->name[0] != '\0' && IsA(queryDesc->planstate, RemoteQueryState)) @@ -1035,6 +1434,7 @@ PortalRunSelect(Portal portal, rqs->cursor = pstrdup(portal->name); } #endif +#endif ExecutorRun(queryDesc, direction, count); nprocessed = queryDesc->estate->es_processed; @@ -1270,7 +1670,13 @@ PortalRunUtility(Portal portal, Node *utilityStmt, bool isTopLevel, IsA(utilityStmt, NotifyStmt) || IsA(utilityStmt, UnlistenStmt) || #ifdef PGXC +#ifdef XCP + IsA(utilityStmt, PauseClusterStmt) || + IsA(utilityStmt, BarrierStmt) || + (IsA(utilityStmt, CheckPointStmt) && IS_PGXC_DATANODE))) +#else (IsA(utilityStmt, CheckPointStmt) && IS_PGXC_DATANODE))) +#endif #else IsA(utilityStmt, CheckPointStmt))) #endif @@ -1795,3 +2201,355 @@ DoPortalRewind(Portal portal) portal->portalPos = 0; portal->posOverflow = false; } + +#ifdef XCP +/* + * Execute the specified portal's query and distribute tuples to consumers. 
+ * Returs 1 if portal should keep producing, 0 if all consumers have enough + * rows in the buffers to pause producing temporarily, -1 if the query is + * completed. + */ +int +AdvanceProducingPortal(Portal portal, bool can_wait) +{ + Portal saveActivePortal; + ResourceOwner saveResourceOwner; + MemoryContext savePortalContext; + MemoryContext oldContext; + QueryDesc *queryDesc; + SharedQueue squeue; + DestReceiver *treceiver; + int result; + + queryDesc = PortalGetQueryDesc(portal); + squeue = queryDesc->squeue; + + Assert(queryDesc); + /* Make sure the portal is producing */ + Assert(squeue && queryDesc->myindex == -1); + /* Make sure there is proper receiver */ + Assert(queryDesc->dest && queryDesc->dest->mydest == DestProducer); + + /* + * Set up global portal context pointers. + */ + saveActivePortal = ActivePortal; + saveResourceOwner = CurrentResourceOwner; + savePortalContext = PortalContext; + PG_TRY(); + { + ActivePortal = portal; + CurrentResourceOwner = portal->resowner; + PortalContext = PortalGetHeapMemory(portal); + + oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal)); + + /* + * That is the first pass thru if the hold store is not initialized yet, + * Need to initialize stuff. + */ + if (portal->holdStore == NULL && portal->status != PORTAL_FAILED) + { + int idx; + char storename[64]; + + PortalCreateProducerStore(portal); + treceiver = CreateDestReceiver(DestTuplestore); + SetTuplestoreDestReceiverParams(treceiver, + portal->holdStore, + portal->holdContext, + false); + SetSelfConsumerDestReceiver(queryDesc->dest, treceiver); + SetProducerTempMemory(queryDesc->dest, portal->tmpContext); + snprintf(storename, 64, "%s producer store", portal->name); + tuplestore_collect_stat(portal->holdStore, storename); + /* + * Tuplestore does not clear eof flag on the active read pointer, + * causing the store is always in EOF state once reached when + * there is a single read pointer. 
We do not want behavior like this + * and workaround by using secondary read pointer. + * Primary read pointer (0) is active when we are writing to + * the tuple store, secondary read pointer is for reading, and its + * eof flag is cleared if a tuple is written to the store. + * We know the extra read pointer has index 1, so do not store it. + */ + idx = tuplestore_alloc_read_pointer(portal->holdStore, 0); + Assert(idx == 1); + } + + if (queryDesc->estate && !queryDesc->estate->es_finished && + portal->status != PORTAL_FAILED) + { + /* + * If the portal's hold store has tuples available for read and + * all consumer queues are not empty we skip advancing the portal + * (pause it) to prevent buffering too many rows at the producer. + * NB just created portal store would not be in EOF state, but in + * this case consumer queues will be empty and do not allow + * erroneous pause. After the first call to AdvanceProducingPortal + * portal will try to read the hold store and EOF flag will be set + * correctly. + */ + tuplestore_select_read_pointer(portal->holdStore, 1); + if (!tuplestore_ateof(portal->holdStore) && + SharedQueueCanPause(squeue)) + result = 0; + else + result = 1; + tuplestore_select_read_pointer(portal->holdStore, 0); + + if (result) + { + /* Execute query and dispatch tuples via dest receiver */ +#define PRODUCE_TUPLES 100 + PushActiveSnapshot(queryDesc->snapshot); + ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES); + PopActiveSnapshot(); + + if (queryDesc->estate->es_processed < PRODUCE_TUPLES) + { + /* + * Finish the executor, but we may still have some tuples + * in the local storages. + * We should keep trying pushing them into the squeue, so do not + * remove the portal from the list of producers. 
+ */ + ExecutorFinish(queryDesc); + } + } + } + + /* Try to dump local tuplestores */ + if ((queryDesc->estate == NULL || queryDesc->estate->es_finished) && + ProducerReceiverPushBuffers(queryDesc->dest)) + { + if (can_wait && queryDesc->estate == NULL) + { + (*queryDesc->dest->rDestroy) (queryDesc->dest); + queryDesc->dest = NULL; + portal->queryDesc = NULL; + squeue = NULL; + + removeProducingPortal(portal); + FreeQueryDesc(queryDesc); + + /* + * Current context is the portal context, which is going + * to be deleted + */ + MemoryContextSwitchTo(TopTransactionContext); + + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + + if (portal->resowner) + { + bool isCommit = (portal->status != PORTAL_FAILED); + + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_BEFORE_LOCKS, + isCommit, false); + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_LOCKS, + isCommit, false); + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_AFTER_LOCKS, + isCommit, false); + ResourceOwnerDelete(portal->resowner); + } + portal->resowner = NULL; + + /* + * Delete tuplestore if present. We should do this even under error + * conditions; since the tuplestore would have been using cross- + * transaction storage, its temp files need to be explicitly deleted. + */ + if (portal->holdStore) + { + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(portal->holdContext); + tuplestore_end(portal->holdStore); + MemoryContextSwitchTo(oldcontext); + portal->holdStore = NULL; + } + + /* delete tuplestore storage, if any */ + if (portal->holdContext) + MemoryContextDelete(portal->holdContext); + + /* release subsidiary storage */ + MemoryContextDelete(PortalGetHeapMemory(portal)); + + /* release portal struct (it's in PortalMemory) */ + pfree(portal); + } + /* report portal is not producing */ + result = -1; + } + else + { + result = SharedQueueCanPause(queryDesc->squeue) ? 
0 : 1; + } + } + PG_CATCH(); + { + /* Uncaught error while executing portal: mark it dead */ + portal->status = PORTAL_FAILED; + /* + * Reset producer to allow consumers to finish, so receiving node will + * handle the error. + */ + if (squeue) + SharedQueueReset(squeue, -1); + + /* Restore global vars and propagate error */ + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + + PG_RE_THROW(); + } + PG_END_TRY(); + + MemoryContextSwitchTo(oldContext); + + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + + return result; +} + + +/* + * Iterate over producing portal, determine already closed, and clean them up, + * waiting while consumers finish their work. Closed producers should be + * cleaned up and resources are released before proceeding with handling of + * next request. + */ +void +cleanupClosedProducers(void) +{ + ListCell *lc = list_head(getProducingPortals()); + while (lc) + { + Portal p = (Portal) lfirst(lc); + QueryDesc *queryDesc = PortalGetQueryDesc(p); + SharedQueue squeue = queryDesc->squeue; + + /* + * Get next already, because next call may remove cell from + * the list and invalidate next reference + */ + lc = lnext(lc); + + /* When portal is closed executor state is not set */ + if (queryDesc->estate == NULL) + { + /* + * Set up global portal context pointers. 
+ */ + Portal saveActivePortal = ActivePortal; + ResourceOwner saveResourceOwner = CurrentResourceOwner; + MemoryContext savePortalContext = PortalContext; + + PG_TRY(); + { + MemoryContext oldContext; + ActivePortal = p; + CurrentResourceOwner = p->resowner; + PortalContext = PortalGetHeapMemory(p); + + oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(p)); + + (*queryDesc->dest->rDestroy) (queryDesc->dest); + queryDesc->dest = NULL; + p->queryDesc = NULL; + squeue = NULL; + + removeProducingPortal(p); + FreeQueryDesc(queryDesc); + + /* + * Current context is the portal context, which is going + * to be deleted + */ + MemoryContextSwitchTo(TopTransactionContext); + + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + + if (p->resowner) + { + bool isCommit = (p->status != PORTAL_FAILED); + + ResourceOwnerRelease(p->resowner, + RESOURCE_RELEASE_BEFORE_LOCKS, + isCommit, false); + ResourceOwnerRelease(p->resowner, + RESOURCE_RELEASE_LOCKS, + isCommit, false); + ResourceOwnerRelease(p->resowner, + RESOURCE_RELEASE_AFTER_LOCKS, + isCommit, false); + ResourceOwnerDelete(p->resowner); + } + p->resowner = NULL; + + /* + * Delete tuplestore if present. We should do this even under error + * conditions; since the tuplestore would have been using cross- + * transaction storage, its temp files need to be explicitly deleted. 
+ */ + if (p->holdStore) + { + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(p->holdContext); + tuplestore_end(p->holdStore); + MemoryContextSwitchTo(oldcontext); + p->holdStore = NULL; + } + + /* delete tuplestore storage, if any */ + if (p->holdContext) + MemoryContextDelete(p->holdContext); + + /* release subsidiary storage */ + MemoryContextDelete(PortalGetHeapMemory(p)); + + /* release portal struct (it's in PortalMemory) */ + pfree(p); + + MemoryContextSwitchTo(oldContext); + } + PG_CATCH(); + { + /* Uncaught error while executing portal: mark it dead */ + p->status = PORTAL_FAILED; + /* + * Reset producer to allow consumers to finish, so receiving node will + * handle the error. + */ + if (squeue) + SharedQueueReset(squeue, -1); + + /* Restore global vars and propagate error */ + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + + PG_RE_THROW(); + } + PG_END_TRY(); + + ActivePortal = saveActivePortal; + CurrentResourceOwner = saveResourceOwner; + PortalContext = savePortalContext; + } + } +} +#endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index eaf510e5f5..cc3daecd62 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -5,6 +5,11 @@ * commands. At one time acted as an interface between the Lisp and C * systems. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -67,7 +72,7 @@ #include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "pgxc/poolutils.h" #include "nodes/nodes.h" #include "pgxc/poolmgr.h" @@ -75,8 +80,12 @@ #include "pgxc/groupmgr.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/builtins.h" #include "utils/snapmgr.h" #include "pgxc/xc_maintenance_mode.h" +#ifdef XCP +#include "pgxc/pause.h" +#endif static void ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRemote, bool force_autocommit, RemoteQueryExecType exec_type, @@ -89,6 +98,7 @@ static RemoteQueryExecType GetNodesForCommentUtility(CommentStmt *stmt, bool *is static RemoteQueryExecType GetNodesForRulesUtility(RangeVar *relation, bool *is_temp); static void DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote, bool *is_temp, RemoteQueryExecType *exec_type); +static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecUtilityWithMessage(const char *queryString, bool sentToRemote, bool is_temp); #endif @@ -389,6 +399,39 @@ standard_ProcessUtility(Node *parsetree, #endif /* PGXC */ char *completionTag) { +#ifdef PGXC + /* + * For more detail see comments in function pgxc_lock_for_backup. + * + * Cosider the following scenario: + * Imagine a two cordinator cluster CO1, CO2 + * Suppose a client connected to CO1 issues select pgxc_lock_for_backup() + * Now assume that a client connected to CO2 issues a create table + * select pgxc_lock_for_backup() would try to acquire the advisory lock + * in exclusive mode, whereas create table would try to acquire the same + * lock in shared mode. 
Both these requests will always try acquire the + * lock in the same order i.e. they would both direct the request first to + * CO1 and then to CO2. One of the two requests would therefore pass + * and the other would fail. + * + * Consider another scenario: + * Suppose we have a two cooridnator cluster CO1 and CO2 + * Assume one client connected to each coordinator + * Further assume one client starts a transaction + * and issues a DDL. This is an unfinished transaction. + * Now assume the second client issues + * select pgxc_lock_for_backup() + * This request would fail because the unfinished transaction + * would already hold the advisory lock. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && IsNormalProcessingMode()) + { + /* Is the statement a prohibited one? */ + if (!IsStmtAllowedInLockedMode(parsetree, queryString)) + pgxc_lock_for_utility_stmt(parsetree); + } +#endif + check_xact_readonly(parsetree); if (completionTag) @@ -413,6 +456,7 @@ standard_ProcessUtility(Node *parsetree, case TRANS_STMT_START: { ListCell *lc; + BeginTransactionBlock(); foreach(lc, stmt->options) { @@ -606,8 +650,20 @@ standard_ProcessUtility(Node *parsetree, #endif /* Run parse analysis ... */ +#ifdef XCP + /* + * If sentToRemote is set it is either EXECUTE DIRECT or part + * of extencion definition script, that is a kind of extension + * specific metadata table. So it makes sense do not distribute + * the relation. If someone sure he needs the table distributed + * it should explicitly specify distribution. + */ + stmts = transformCreateStmt((CreateStmt *) parsetree, + queryString, !sentToRemote); +#else stmts = transformCreateStmt((CreateStmt *) parsetree, queryString); +#endif #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { @@ -667,8 +723,12 @@ standard_ProcessUtility(Node *parsetree, * Coordinator, if not already done so */ if (!sentToRemote) +#ifdef XCP + stmts = AddRemoteQueryNode(stmts, queryString, is_temp ? 
EXEC_ON_DATANODES : EXEC_ON_ALL_NODES); +#else stmts = AddRemoteQueryNode(stmts, queryString, EXEC_ON_ALL_NODES, is_temp); #endif +#endif /* ... and do it */ foreach(l, stmts) @@ -681,15 +741,18 @@ standard_ProcessUtility(Node *parsetree, static char *validnsps[] = HEAP_RELOPT_NAMESPACES; #ifdef PGXC +#ifndef XCP /* Set temporary object object flag in pooler */ if (is_temp) PoolManagerSetCommand(POOL_CMD_TEMP, NULL); #endif +#endif /* Create the table itself */ relOid = DefineRelation((CreateStmt *) stmt, RELKIND_RELATION, InvalidOid); + /* * Let AlterTableCreateToastTable decide if this one * needs a secondary relation too. @@ -702,7 +765,6 @@ standard_ProcessUtility(Node *parsetree, "toast", validnsps, true, false); - (void) heap_reloptions(RELKIND_TOASTVALUE, toast_options, true); @@ -800,7 +862,11 @@ standard_ProcessUtility(Node *parsetree, #ifdef PGXC ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support FOREIGN DATA WRAPPER yet"), +#else errmsg("Postgres-XC does not support FOREIGN DATA WRAPPER yet"), +#endif errdetail("The feature is not currently supported"))); #endif CreateForeignDataWrapper((CreateFdwStmt *) parsetree); @@ -814,7 +880,11 @@ standard_ProcessUtility(Node *parsetree, #ifdef PGXC ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support SERVER yet"), +#else errmsg("Postgres-XC does not support SERVER yet"), +#endif errdetail("The feature is not currently supported"))); #endif CreateForeignServer((CreateForeignServerStmt *) parsetree); @@ -828,7 +898,11 @@ standard_ProcessUtility(Node *parsetree, #ifdef PGXC ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support USER MAPPING yet"), +#else errmsg("Postgres-XC does not support USER MAPPING yet"), +#endif errdetail("The feature is not currently supported"))); #endif CreateUserMapping((CreateUserMappingStmt *) parsetree); @@ -895,15 +969,34 @@ 
standard_ProcessUtility(Node *parsetree, break; case T_TruncateStmt: + ExecuteTruncate((TruncateStmt *) parsetree); #ifdef PGXC /* - * In Postgres-XC, TRUNCATE needs to be launched to remote nodes - * before AFTER triggers. As this needs an internal control it is - * managed by this function internally. + * Check details of the object being truncated. + * If at least one temporary table is truncated truncate cannot use 2PC + * at commit. */ - ExecuteTruncate((TruncateStmt *) parsetree, queryString); -#else - ExecuteTruncate((TruncateStmt *) parsetree); + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + bool is_temp = false; + ListCell *cell; + TruncateStmt *stmt = (TruncateStmt *) parsetree; + + foreach(cell, stmt->relations) + { + Oid relid; + RangeVar *rel = (RangeVar *) lfirst(cell); + + relid = RangeVarGetRelid(rel, NoLock, false); + if (IsTempTable(relid)) + { + is_temp = true; + break; + } + } + + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_DATANODES, is_temp); + } #endif break; @@ -929,6 +1022,7 @@ standard_ProcessUtility(Node *parsetree, case T_CopyStmt: { uint64 processed; + processed = DoCopy((CopyStmt *) parsetree, queryString); if (completionTag) snprintf(completionTag, COMPLETION_TAG_BUFSIZE, @@ -1056,7 +1150,7 @@ standard_ProcessUtility(Node *parsetree, { AlterTableStmt *atstmt = (AlterTableStmt *) parsetree; Oid relid; - List *stmts = NIL; + List *stmts; ListCell *l; LOCKMODE lockmode; @@ -1092,7 +1186,11 @@ standard_ProcessUtility(Node *parsetree, relid, &is_temp); +#ifdef XCP + stmts = AddRemoteQueryNode(stmts, queryString, exec_type); +#else stmts = AddRemoteQueryNode(stmts, queryString, exec_type, is_temp); +#endif } } #endif @@ -1367,7 +1465,13 @@ standard_ProcessUtility(Node *parsetree, #ifdef PGXC if (IS_PGXC_COORDINATOR) { +#ifdef XCP + ViewStmt *stmt = (ViewStmt *) parsetree; + + if (stmt->view->relpersistence != RELPERSISTENCE_TEMP) +#else if (!ExecIsTempObjectIncluded()) +#endif 
ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false); } #endif @@ -1479,9 +1583,11 @@ standard_ProcessUtility(Node *parsetree, { bool is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP; +#ifndef XCP /* Set temporary object flag in pooler */ if (is_temp) PoolManagerSetCommand(POOL_CMD_TEMP, NULL); +#endif ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp); } @@ -1572,7 +1678,11 @@ standard_ProcessUtility(Node *parsetree, /* Clean also remote Coordinators */ sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", stmt->dbname); +#ifdef XCP + ExecUtilityStmtOnNodes(query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false); +#else ExecUtilityStmtOnNodes(query, NULL, sentToRemote, true, EXEC_ON_COORDS, false); +#endif } #endif @@ -1651,12 +1761,12 @@ standard_ProcessUtility(Node *parsetree, /* we choose to allow this during "read only" transactions */ PreventCommandDuringRecovery("VACUUM"); #ifdef PGXC - /* - * We have to run the command on nodes before Coordinator because - * vacuum() pops active snapshot and we can not send it to nodes - */ - if (IS_PGXC_COORDINATOR) - ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_DATANODES, false); + /* + * We have to run the command on nodes before Coordinator because + * vacuum() pops active snapshot and we can not send it to nodes + */ + if (IS_PGXC_COORDINATOR) + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_DATANODES, false); #endif vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false, isTopLevel); @@ -1674,6 +1784,7 @@ standard_ProcessUtility(Node *parsetree, case T_VariableSetStmt: ExecSetVariableStmt((VariableSetStmt *) parsetree); #ifdef PGXC +#ifndef XCP /* Let the pooler manage the statement */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { @@ -1697,6 +1808,7 @@ standard_ProcessUtility(Node *parsetree, } } #endif +#endif break; case T_VariableShowStmt: @@ -1726,25 +1838,18 @@ 
standard_ProcessUtility(Node *parsetree, (void) CreateTrigger((CreateTrigStmt *) parsetree, queryString, InvalidOid, InvalidOid, false); #ifdef PGXC - if (IS_PGXC_COORDINATOR) - { - CreateTrigStmt *stmt = (CreateTrigStmt *) parsetree; - RemoteQueryExecType exec_type; - bool is_temp; - - /* Postgres-XC does not support yet FOR EACH ROW yet */ - if (stmt->row) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XC does not support ROW TRIGGER yet"), - errdetail("The feature is not currently supported"))); - - exec_type = ExecUtilityFindNodes(OBJECT_TABLE, - RangeVarGetRelid(stmt->relation, NoLock, false), - &is_temp); + /* Postgres-XC does not support yet triggers */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support TRIGGER yet"), +#else + errmsg("Postgres-XC does not support TRIGGER yet"), +#endif + errdetail("The feature is not currently supported"))); - ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp); - } + if (IS_PGXC_COORDINATOR) + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false); #endif break; @@ -1835,6 +1940,18 @@ standard_ProcessUtility(Node *parsetree, case T_ConstraintsSetStmt: AfterTriggerSetState((ConstraintsSetStmt *) parsetree); #ifdef PGXC +#ifdef XCP + /* + * Just send statement to all the datanodes. It is effectively noop + * if no transaction, because transaction will be committed and + * changes will be cleared after completion. + * Side effect of that command is that session takes a connection + * to each Datanode and holds it while transaction lasts, even if + * subsequent statements won't use some of them. 
+ */ + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, + EXEC_ON_DATANODES, false); +#else /* * Let the pooler manage the statement, SET CONSTRAINT can just be used * inside a transaction block, hence it has no effect outside that, so use @@ -1846,6 +1963,7 @@ standard_ProcessUtility(Node *parsetree, elog(ERROR, "Postgres-XC: ERROR SET query"); } #endif +#endif break; case T_CheckPointStmt: @@ -1872,6 +1990,12 @@ standard_ProcessUtility(Node *parsetree, case T_BarrierStmt: RequestBarrier(((BarrierStmt *) parsetree)->id, completionTag); break; +#ifdef XCP + case T_PauseClusterStmt: + RequestClusterPause(((PauseClusterStmt *) parsetree)->pause, completionTag); + break; +#endif + /* * Node DDL is an operation local to Coordinator. @@ -1880,6 +2004,10 @@ standard_ProcessUtility(Node *parsetree, */ case T_AlterNodeStmt: PgxcNodeAlter((AlterNodeStmt *) parsetree); +#ifdef XCP + if (((AlterNodeStmt *) parsetree)->cluster) + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false); +#endif break; case T_CreateNodeStmt: @@ -2007,11 +2135,21 @@ standard_ProcessUtility(Node *parsetree, break; case T_CleanConnStmt: - Assert(IS_PGXC_COORDINATOR); +#ifdef XCP + /* + * First send command to other nodes via probably existing + * connections, then clean local pooler + */ + if (IS_PGXC_COORDINATOR) + ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false); + CleanConnection((CleanConnStmt *) parsetree); +#else + Assert(IS_PGXC_COORDINATOR); CleanConnection((CleanConnStmt *) parsetree); if (IS_PGXC_COORDINATOR) ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_COORDS, false); +#endif break; #endif default: @@ -2093,7 +2231,9 @@ ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRem step->sql_statement = pstrdup(queryString); step->force_autocommit = force_autocommit; step->exec_type = exec_type; +#ifndef XCP step->is_temp = is_temp; +#endif 
ExecRemoteUtility(step); pfree(step->sql_statement); pfree(step); @@ -2124,9 +2264,7 @@ ExecUtilityFindNodes(ObjectType object_type, exec_type = EXEC_ON_ALL_NODES; break; - /* Triggers are evaluated based on the relation they are defined on */ case OBJECT_TABLE: - case OBJECT_TRIGGER: /* Do the check on relation kind */ exec_type = ExecUtilityFindNodesRelkind(object_id, is_temp); break; @@ -2176,13 +2314,21 @@ ExecUtilityFindNodesRelkind(Oid relid, bool *is_temp) switch (relkind_str) { case RELKIND_SEQUENCE: +#ifndef XCP *is_temp = IsTempTable(relid); exec_type = EXEC_ON_ALL_NODES; break; - +#endif case RELKIND_RELATION: +#ifdef XCP + if ((*is_temp = IsTempTable(relid))) + exec_type = EXEC_ON_DATANODES; + else + exec_type = EXEC_ON_ALL_NODES; +#else *is_temp = IsTempTable(relid); exec_type = EXEC_ON_ALL_NODES; +#endif break; case RELKIND_VIEW: @@ -3050,6 +3196,12 @@ CreateCommandTag(Node *parsetree) case T_DropGroupStmt: tag = "DROP NODE GROUP"; break; + +#ifdef XCP + case T_PauseClusterStmt: + tag = "PAUSE/UNPAUSE CLUSTER"; + break; +#endif #endif case T_ReindexStmt: @@ -3202,12 +3354,14 @@ CreateCommandTag(Node *parsetree) } break; +#ifdef PGXC case T_ExecDirectStmt: tag = "EXECUTE DIRECT"; break; case T_CleanConnStmt: tag = "CLEAN CONNECTION"; break; +#endif default: elog(WARNING, "unrecognized node type: %d", @@ -3651,7 +3805,18 @@ GetCommandLogLevel(Node *parsetree) lev = LOGSTMT_DDL; break; #endif - +#ifdef XCP + case T_AlterNodeStmt: + case T_CreateNodeStmt: + case T_DropNodeStmt: + case T_CreateGroupStmt: + case T_DropGroupStmt: + lev = LOGSTMT_DDL; + break; + case T_ExecDirectStmt: + lev = LOGSTMT_ALL; + break; +#endif default: elog(WARNING, "unrecognized node type: %d", (int) nodeTag(parsetree)); @@ -3664,6 +3829,97 @@ GetCommandLogLevel(Node *parsetree) #ifdef PGXC /* + * IsStmtAllowedInLockedMode + * + * Allow/Disallow a utility command while cluster is locked + * A statement will be disallowed if it makes such changes + * in catalog that are backed up 
by pg_dump except + * CREATE NODE that has to be allowed because + * a new node has to be created while the cluster is still + * locked for backup + */ +static bool +IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString) +{ +#define ALLOW 1 +#define DISALLOW 0 + + switch (nodeTag(parsetree)) + { + /* To allow creation of temp tables */ + case T_CreateStmt: /* CREATE TABLE */ + { + CreateStmt *stmt = (CreateStmt *) parsetree; + if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP) + return ALLOW; + return DISALLOW; + } + break; + + case T_ExecuteStmt: /* + * Prepared statememts can only have + * SELECT, INSERT, UPDATE, DELETE, + * or VALUES statement, there is no + * point stopping EXECUTE. + */ + case T_CreateNodeStmt: /* + * This has to be allowed so that the new node + * can be created, while the cluster is still + * locked for backup + */ + case T_DropNodeStmt: /* + * This has to be allowed so that DROP NODE + * can be issued to drop a node that has crashed. + * Otherwise system would try to acquire a shared + * advisory lock on the crashed node. + */ + + case T_TransactionStmt: + case T_PlannedStmt: + case T_ClosePortalStmt: + case T_FetchStmt: + case T_TruncateStmt: + case T_CopyStmt: + case T_PrepareStmt: /* + * Prepared statememts can only have + * SELECT, INSERT, UPDATE, DELETE, + * or VALUES statement, there is no + * point stopping PREPARE. 
+ */ + case T_DeallocateStmt: /* + * If prepare is allowed the deallocate should + * be allowed also + */ + case T_DoStmt: + case T_NotifyStmt: + case T_ListenStmt: + case T_UnlistenStmt: + case T_LoadStmt: + case T_ClusterStmt: + case T_VacuumStmt: + case T_ExplainStmt: + case T_VariableSetStmt: + case T_VariableShowStmt: + case T_DiscardStmt: + case T_LockStmt: + case T_ConstraintsSetStmt: + case T_CheckPointStmt: + case T_BarrierStmt: + case T_ReindexStmt: + case T_RemoteQuery: + case T_CleanConnStmt: +#ifdef XCP + case T_PauseClusterStmt: +#endif + return ALLOW; + + default: + return DISALLOW; + } + return DISALLOW; +} + +/* * GetCommentObjectId * TODO Change to return the nodes to execute the utility on * @@ -3831,18 +4087,17 @@ DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote, } break; - /* - * Those objects are dropped depending on the nature of the relationss - * they are defined on. This evaluation uses the temporary behavior - * and the relkind of the relation used. - */ case OBJECT_RULE: - case OBJECT_TRIGGER: { + /* + * In the case of a rule we need to find the object on + * which the rule is dependent and define if this rule + * has a dependency with a temporary object or not. 
+ */ List *objname = linitial(stmt->objects); Relation relation = NULL; - get_object_address(stmt->removeType, + get_object_address(OBJECT_RULE, objname, NIL, &relation, AccessExclusiveLock, @@ -3850,7 +4105,7 @@ DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote, /* Do nothing if no relation */ if (relation && OidIsValid(relation->rd_id)) - res_exec_type = ExecUtilityFindNodes(stmt->removeType, + res_exec_type = ExecUtilityFindNodes(OBJECT_RULE, relation->rd_id, &res_is_temp); else diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 5582a06c7f..e33e33f67a 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -3,6 +3,11 @@ * arrayfuncs.c * Support functions for arrays. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -161,6 +166,40 @@ array_in(PG_FUNCTION_ARGS) lBound[MAXDIM]; ArrayMetaState *my_extra; +#ifdef XCP + /* Make a modifiable copy of the input */ + string_save = pstrdup(string); + if (*string_save == '(') + { + /* + * String representation contains prefix defining data type of array + * elements, if array has been output as anyarray. + */ + char *typnspname; + char *typname; + + /* Type namespace is started after '(' and terminated by a '.' 
*/ + typnspname = string_save + 1; + for (p = typnspname; *p != '.'; p++) + if (*p == ')' || *p == '\0') /* dot not found */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid element type definition"))); + /* it is OK to modify the copy */ + *p = '\0'; + typname = p + 1; + for (p = typname; *p != ')'; p++) + if (*p == '\0') /* closing paren not found */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid element type definition"))); + *p = '\0'; + p++; + element_type = get_typname_typid(typname, get_namespaceid(typnspname)); + } + else + p = string_save; +#endif /* * We arrange to look up info about element type, including its input * conversion proc, only once per series of calls, assuming the element @@ -194,6 +233,7 @@ array_in(PG_FUNCTION_ARGS) typdelim = my_extra->typdelim; typioparam = my_extra->typioparam; +#ifndef XCP /* Make a modifiable copy of the input */ string_save = pstrdup(string); @@ -206,6 +246,7 @@ array_in(PG_FUNCTION_ARGS) * outer loop iterates once per dimension item. */ p = string_save; +#endif ndim = 0; for (;;) { diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 6e29ebb784..03024cc242 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -3,6 +3,11 @@ * date.c * implements DATE and TIME data types specified in SQL-92 standard * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * @@ -24,6 +29,9 @@ #include "libpq/pqformat.h" #include "miscadmin.h" #include "parser/scansup.h" +#ifdef XCP +#include "pgxc/pgxc.h" +#endif #include "utils/array.h" #include "utils/builtins.h" #include "utils/date.h" @@ -191,7 +199,15 @@ date_out(PG_FUNCTION_ARGS) { j2date(date + POSTGRES_EPOCH_JDATE, &(tm->tm_year), &(tm->tm_mon), &(tm->tm_mday)); +#ifdef XCP + /* + * We want other nodes could parse encoded dates correctly. + * ISO date style is best suitable for that + */ + EncodeDateOnly(tm, IS_PGXC_DATANODE ? USE_ISO_DATES : DateStyle, buf); +#else EncodeDateOnly(tm, DateStyle, buf); +#endif } result = pstrdup(buf); diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 78fc657207..35e171017e 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -2,6 +2,11 @@ * dbsize.c * Database object size functions, and related inquiries * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Copyright (c) 2002-2012, PostgreSQL Global Development Group * * IDENTIFICATION @@ -36,6 +41,14 @@ #include "utils/relmapper.h" #include "utils/lsyscache.h" #include "utils/syscache.h" +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "pgxc/execRemote.h" +#include "utils/snapmgr.h" +#endif #ifdef PGXC static Datum pgxc_database_size(Oid dbOid); @@ -885,7 +898,11 @@ pg_relation_filepath(PG_FUNCTION_ARGS) break; case RELPERSISTENCE_TEMP: if (isTempOrToastNamespace(relform->relnamespace)) +#ifdef XCP + backend = OidIsValid(MyCoordId) ? 
InvalidBackendId : MyBackendId; +#else backend = MyBackendId; +#endif else { /* Do it the hard way. */ @@ -971,16 +988,72 @@ pgxc_database_size(Oid dbOid) Datum pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query) { +#ifndef XCP StringInfoData buf; int ret; TupleDesc spi_tupdesc; +#endif int i; int64 total_size = 0; int64 size = 0; +#ifndef XCP bool isnull; char *nodename; +#endif Datum datum; +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result; + Var *dummy; + + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + for (i = 0; i < numnodes; i++) + { + char ntype = PGXC_NODE_NONE; + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(nodelist[i], &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", nodelist[i]))); + } + plan->sql_statement = query; + plan->force_autocommit = false; + plan->exec_type = EXEC_ON_DATANODES; + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
+ */ + dummy = makeVar(1, 1, INT8OID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, 1, NULL, false)); + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery(pstate); + while (!TupIsNull(result)) + { + bool isnull; + datum = slot_getattr(result, 1, &isnull); + size = DatumGetInt64(datum); + total_size += size; + result = ExecRemoteQuery(pstate); + } + ExecEndRemoteQuery(pstate); +#else /* * Connect to SPI manager */ @@ -1022,6 +1095,7 @@ pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query) } SPI_finish(); +#endif if (numnodes == 1) PG_RETURN_DATUM(datum); diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 21d26def79..1c05301c7c 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -19,6 +19,8 @@ #include "pgxc/pgxc.h" #include "pgxc/pgxcnode.h" #include "pgxc/nodemgr.h" +#include "executor/spi.h" +#include "tcop/utility.h" #endif #include "storage/predicate_internals.h" #include "utils/builtins.h" @@ -1061,3 +1063,128 @@ pg_advisory_unlock_all(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +#ifdef PGXC +/* + * pgxc_lock_for_backup + * + * Lock the cluster for taking backup + * To lock the cluster, try to acquire a session level advisory lock exclusivly + * By lock we mean to disallow any statements that change + * the portions of the catalog which are backed up by pg_dump/pg_dumpall + * Returns true or fails with an error message. 
+ */ +Datum +pgxc_lock_for_backup(PG_FUNCTION_ARGS) +{ + bool lockAcquired = false; + int prepared_xact_count = 0; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("only superuser can lock the cluster for backup"))); + + /* + * The system cannot be locked for backup if there is an uncommitted + * prepared transaction, the reason is as follows: + * Utility statements are divided into two groups, one is allowed group + * and the other is disallowed group. A statement is put in allowed group + * if it does not make changes to the catalog or makes such changes which + * are not backed up by pg_dump or pg_dumpall, otherwise it is put in + * disallowed group. Every time a disallowed statement is issued we try to + * hold an advisory lock in shared mode and if the lock can be acquired + * only then the statement is allowed. + * In case of prepared transactions suppose the lock is not released at + * prepare transaction 'txn_id' + * Consider the following scenario: + * + * begin; + * create table abc_def(a int, b int); + * insert into abc_def values(1,2),(3,4); + * prepare transaction 'abc'; + * + * Now assume that the server is restarted for any reason. + * When prepared transactions are saved on disk, session level locks are + * ignored and hence when the prepared transactions are reterieved and all + * the other locks are reclaimed, but session level advisory locks are + * not reclaimed. + * Hence we made the following decisions + * a) Transaction level advisory locks should be used for DDLs which are + * automatically released at prepare transaction 'txn_id' + * b) If there is any uncomitted prepared transaction, it is assumed + * that it must be issuing a statement that belongs to disallowed + * group and hence the request to hold the advisory lock exclusively + * is denied. 
+ */ + + /* Connect to SPI manager to check any prepared transactions */ + if (SPI_connect() < 0) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("internal error while locking the cluster for backup"))); + } + + /* Are there any prepared transactions that have not yet been committed? */ + SPI_execute("select gid from pg_catalog.pg_prepared_xacts limit 1", true, 0); + prepared_xact_count = SPI_processed; + SPI_finish(); + + if (prepared_xact_count > 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("cannot lock cluster for backup in presence of %d uncommitted prepared transactions", + prepared_xact_count))); + } + + /* try to acquire the advisory lock in exclusive mode */ + lockAcquired = DatumGetBool(DirectFunctionCall2( + pg_try_advisory_lock_int4, + xc_lockForBackupKey1, + xc_lockForBackupKey2)); + + if (!lockAcquired) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("cannot lock cluster for backup, lock is already held"))); + + /* + * sessin level advisory locks stay for only as long as the session + * that issues them does + */ + elog(INFO, "please do not close this session until you are done adding the new node"); + + /* will be true always */ + PG_RETURN_BOOL(lockAcquired); +} + +/* + * pgxc_lock_for_backup + * + * Lock the cluster for taking backup + * To lock the cluster, try to acquire a session level advisory lock exclusivly + * By lock we mean to disallow any statements that change + * the portions of the catalog which are backed up by pg_dump/pg_dumpall + * Returns true or fails with an error message. 
+ */ +bool +pgxc_lock_for_utility_stmt(Node *parsetree) +{ + bool lockAcquired; + + lockAcquired = DatumGetBool(DirectFunctionCall2( + pg_try_advisory_xact_lock_shared_int4, + xc_lockForBackupKey1, + xc_lockForBackupKey2)); + + if (!lockAcquired) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot execute %s in a locked cluster", + CreateCommandTag(parsetree)))); + + return lockAcquired; +} +#endif diff --git a/src/backend/utils/adt/pseudotypes.c b/src/backend/utils/adt/pseudotypes.c index d7770b829a..39e84bf863 100644 --- a/src/backend/utils/adt/pseudotypes.c +++ b/src/backend/utils/adt/pseudotypes.c @@ -11,6 +11,11 @@ * we do better?) * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,7 +31,12 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/rangetypes.h" - +#ifdef XCP +#include "access/htup.h" +#include "catalog/pg_type.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" +#endif /* * cstring_in - input routine for pseudo-type CSTRING. @@ -117,22 +127,80 @@ any_out(PG_FUNCTION_ARGS) Datum anyarray_in(PG_FUNCTION_ARGS) { +#ifdef XCP + /* + * XCP version of array_in() understands prefix describing element type + */ + return array_in(fcinfo); +#else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot accept a value of type anyarray"))); PG_RETURN_VOID(); /* keep compiler quiet */ +#endif } /* * anyarray_out - output routine for pseudo-type ANYARRAY. * * We may as well allow this, since array_out will in fact work. + * XCP needs to send from data nodes to coordinator values of that type. 
+ * To be able to restore values at the destination node we need to know + * actual element type. */ Datum anyarray_out(PG_FUNCTION_ARGS) { +#ifdef XCP + /* + * Output prefix: (type_namespace_name.typename) to look up actual element + * type at the destination node then output in usual format for array + */ + ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); + Oid element_type = ARR_ELEMTYPE(v); + Form_pg_type typeForm; + HeapTuple typeTuple; + char *typname, + *typnspname; + /* two identifiers, parenthesis, dot and trailing \0 */ + char prefix[2*NAMEDATALEN+4], + *retval, + *newval; + int prefixlen, retvallen; + Datum array_out_result; + MemoryContext save_context; + + save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); + /* Figure out type name and type namespace */ + typeTuple = SearchSysCache(TYPEOID, + ObjectIdGetDatum(element_type), + 0, 0, 0); + if (!HeapTupleIsValid(typeTuple)) + elog(ERROR, "cache lookup failed for type %u", element_type); + typeForm = (Form_pg_type) GETSTRUCT(typeTuple); + typname = NameStr(typeForm->typname); + typnspname = get_namespace_name(typeForm->typnamespace); + + sprintf(prefix, "(%s.%s)", typnspname, typname); + ReleaseSysCache(typeTuple); + MemoryContextSwitchTo(save_context); + + /* Get standard output and make up prefixed result */ + array_out_result = array_out(fcinfo); + retval = DatumGetCString(array_out_result); + prefixlen = strlen(prefix); + retvallen = strlen(retval); + newval = (char *) palloc(prefixlen + retvallen + 1); + strcpy(newval, prefix); + strcpy(newval + prefixlen, retval); + + pfree(retval); + + PG_RETURN_CSTRING(newval); +#else return array_out(fcinfo); +#endif } /* diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 721f2d7e65..f4d06305e0 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -13,6 +13,11 @@ * plan --- consider improving this someday. 
* * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * * src/backend/utils/adt/ri_triggers.c @@ -267,7 +272,7 @@ RI_FKey_check(PG_FUNCTION_ARGS) int i; #ifdef PGXC - /* + /* * Referential integrity is not supported on Coordinator as it has no data, so * we just come out of the function without actually performing any integrity checks. */ @@ -2646,7 +2651,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel) const char *sep; int i; int save_nestlevel; +#ifndef XCP char workmembuf[32]; +#endif int spi_result; SPIPlanPtr qplan; @@ -2790,10 +2797,17 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel) */ save_nestlevel = NewGUCNestLevel(); +#ifndef XCP + /* + * In multitenant extension we restrict permission on work_mem. + * This code may be executed by ordinary user, so skip this optimization. + * XXX look for workaround + */ snprintf(workmembuf, sizeof(workmembuf), "%d", maintenance_work_mem); (void) set_config_option("work_mem", workmembuf, PGC_USERSET, PGC_S_SESSION, GUC_ACTION_SAVE, true, 0); +#endif if (SPI_connect() != SPI_OK_CONNECT) elog(ERROR, "SPI_connect failed"); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index ad3fbb3b4b..7fbaeef351 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -4,6 +4,11 @@ * Functions to convert stored expressions/querytrees back to * source text * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -56,7 +61,7 @@ #include "parser/parsetree.h" #ifdef PGXC #include "pgxc/pgxc.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #endif #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" @@ -71,6 +76,7 @@ #include "utils/typcache.h" #include "utils/xml.h" + /* ---------- * Pretty formatting constants * ---------- @@ -109,10 +115,9 @@ typedef struct int indentLevel; /* current indent level for prettyprint */ bool varprefix; /* TRUE to print prefixes on Vars */ #ifdef PGXC +#ifndef XCP bool finalise_aggs; /* should Datanode finalise the aggregates? */ - bool sortgroup_colno;/* instead of expression use resno for - * sortgrouprefs. - */ +#endif /* XCP */ #endif /* PGXC */ } deparse_context; @@ -210,11 +215,7 @@ static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags); static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, int prettyFlags, int startIndent -#ifdef PGXC - , bool finalise_aggregates, bool sortgroup_colno -#endif /* PGXC */ - ); + TupleDesc resultDesc, int prettyFlags, int startIndent); static void get_values_def(List *values_lists, deparse_context *context); static void get_with_clause(Query *query, deparse_context *context); static void get_select_query_def(Query *query, deparse_context *context, @@ -729,7 +730,9 @@ pg_get_triggerdef_worker(Oid trigid, bool pretty) context.varprefix = true; context.prettyFlags = pretty ? 
PRETTYFLAG_PAREN : 0; #ifdef PGXC +#ifndef XCP context.finalise_aggs = false; +#endif /* XCP */ #endif /* PGXC */ context.indentLevel = PRETTYINDENT_STD; @@ -2179,7 +2182,9 @@ deparse_expression_pretty(Node *expr, List *dpcontext, context.varprefix = forceprefix; context.prettyFlags = prettyFlags; #ifdef PGXC +#ifndef XCP context.finalise_aggs = false; +#endif /* XCP */ #endif /* PGXC */ context.indentLevel = startIndent; @@ -2224,6 +2229,36 @@ deparse_context_for(const char *aliasname, Oid relid) return list_make1(dpns); } +#ifdef PGXC +List * +deparse_context_for_remotequery(Alias *aliasname, Oid relid) +{ + deparse_namespace *dpns; + RangeTblEntry *rte; + + dpns = (deparse_namespace *) palloc(sizeof(deparse_namespace)); + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = relid; + rte->eref = aliasname; + rte->inh = false; + rte->inFromCl = true; + + /* Build one-element rtable */ + dpns->rtable = list_make1(rte); + dpns->ctes = NIL; + dpns->planstate = NULL; + dpns->ancestors = NIL; + dpns->outer_planstate = dpns->inner_planstate = NULL; + dpns->remotequery = true; + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} +#endif + /* * deparse_context_for_planstate - Build deparse context for a plan * @@ -2663,7 +2698,9 @@ make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, context.prettyFlags = prettyFlags; context.indentLevel = PRETTYINDENT_STD; #ifdef PGXC +#ifndef XCP context.finalise_aggs = false; +#endif /* XCP */ #endif /* PGXC */ memset(&dpns, 0, sizeof(dpns)); @@ -2691,11 +2728,7 @@ make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, foreach(action, actions) { query = (Query *) lfirst(action); - get_query_def(query, buf, NIL, NULL, prettyFlags, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); + get_query_def(query, buf, NIL, NULL, prettyFlags, 0); if (prettyFlags) appendStringInfo(buf, ";\n"); else @@ -2712,11 +2745,7 @@ 
make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, Query *query; query = (Query *) linitial(actions); - get_query_def(query, buf, NIL, NULL, prettyFlags, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); + get_query_def(query, buf, NIL, NULL, prettyFlags, 0); appendStringInfo(buf, ";"); } } @@ -2784,11 +2813,7 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, ev_relation = heap_open(ev_class, AccessShareLock); get_query_def(query, buf, NIL, RelationGetDescr(ev_relation), - prettyFlags, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); + prettyFlags, 0); appendStringInfo(buf, ";"); heap_close(ev_relation, AccessShareLock); @@ -2804,11 +2829,168 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, * ---------- */ void -deparse_query(Query *query, StringInfo buf, List *parentnamespace, - bool finalise_aggs, bool sortgroup_colno) +deparse_query(Query *query, StringInfo buf, List *parentnamespace) +{ + get_query_def(query, buf, parentnamespace, NULL, 0, 0); +} + +/* code borrowed from get_insert_query_def */ +void +get_query_def_from_valuesList(Query *query, StringInfo buf) { - get_query_def(query, buf, parentnamespace, NULL, 0, 0, finalise_aggs, - sortgroup_colno); + + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *values_cell; + ListCell *l; + List *strippedexprs; + deparse_context context; + deparse_namespace dpns; + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. Note we assume it's OK to scribble on the passed + * querytree! 
+ */ + AcquireRewriteLocks(query, false); + + context.buf = buf; + context.namespaces = NIL; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = 0; + context.indentLevel = 0; +#ifdef PGXC +#ifndef XCP + context.finalise_aggs = query->qry_finalise_aggs; +#endif /* XCP */ +#endif /* PGXC */ + + dpns.rtable = query->rtable; + dpns.ctes = query->cteList; + dpns.planstate = NULL; + dpns.ancestors = NIL; + dpns.outer_planstate = dpns.inner_planstate = NULL; + dpns.remotequery = false; + + /* + * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be + * a single RTE for the SELECT or VALUES. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + appendStringInfo(buf, "INSERT INTO %s (", + generate_relation_name(rte->relid, NIL)); + + /* + * Add the insert-column-names list. To handle indirection properly, we + * need to look for indirection nodes in the top targetlist (if it's + * INSERT ... SELECT or INSERT ... single VALUES), or in the first + * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We + * assume that all the expression lists will have similar indirection in + * the latter case. 
+ */ + if (values_rte) + values_cell = list_head((List *) linitial(values_rte->values_lists)); + else + values_cell = NULL; + strippedexprs = NIL; + sep = ""; + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); + if (tle->resjunk || !IsA(tle->expr, Var)) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + */ + if (values_cell) + { + /* we discard the stripped expression in this case */ + processIndirection((Node *) lfirst(values_cell), &context, true); + values_cell = lnext(values_cell); + } + else + { + /* we keep a list of the stripped expressions in this case */ + strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context, true)); + } + } + appendStringInfo(buf, ") "); + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context.prettyFlags, context.indentLevel); + } + else if (values_rte) + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, &context); + } + else + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the single-VALUES expression list */ + appendContextKeyword(&context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, &context, false); + appendStringInfoChar(buf, ')'); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(&context, " 
RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, &context, NULL); + } } #endif /* ---------- @@ -2820,11 +3002,7 @@ deparse_query(Query *query, StringInfo buf, List *parentnamespace, */ static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, int prettyFlags, int startIndent -#ifdef PGXC - , bool finalise_aggs, bool sortgroup_colno -#endif /* PGXC */ - ) + TupleDesc resultDesc, int prettyFlags, int startIndent) { deparse_context context; deparse_namespace dpns; @@ -2846,8 +3024,9 @@ get_query_def(Query *query, StringInfo buf, List *parentnamespace, context.prettyFlags = prettyFlags; context.indentLevel = startIndent; #ifdef PGXC - context.finalise_aggs = finalise_aggs; - context.sortgroup_colno = sortgroup_colno; +#ifndef XCP + context.finalise_aggs = query->qry_finalise_aggs; +#endif /* XCP */ #endif /* PGXC */ memset(&dpns, 0, sizeof(dpns)); @@ -2986,11 +3165,7 @@ get_with_clause(Query *query, deparse_context *context) if (PRETTY_INDENT(context)) appendContextKeyword(context, "", 0, 0, 0); get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL, - context->prettyFlags, context->indentLevel -#ifdef PGXC - , context->finalise_aggs, context->sortgroup_colno -#endif /* PGXC */ - ); + context->prettyFlags, context->indentLevel); if (PRETTY_INDENT(context)) appendContextKeyword(context, "", 0, 0, 0); appendStringInfoChar(buf, ')'); @@ -3393,11 +3568,7 @@ get_setop_query(Node *setOp, Query *query, deparse_context *context, if (need_paren) appendStringInfoChar(buf, '('); get_query_def(subquery, buf, context->namespaces, resultDesc, - context->prettyFlags, context->indentLevel -#ifdef PGXC - , context->finalise_aggs, context->sortgroup_colno -#endif /* PGXC */ - ); + context->prettyFlags, context->indentLevel); if (need_paren) appendStringInfoChar(buf, ')'); } @@ -3493,7 +3664,7 @@ get_rule_sortgroupclause(SortGroupClause *srt, List *tlist, bool force_colno, * dump it 
without any decoration. Otherwise, just dump the expression * normally. */ - if (force_colno || context->sortgroup_colno) + if (force_colno) { Assert(!tle->resjunk); appendStringInfo(buf, "%d", tle->resno); @@ -3716,6 +3887,7 @@ get_insert_query_def(Query *query, deparse_context *context) get_with_clause(query, context); #ifdef PGXC +#ifndef XCP /* * In the case of "INSERT ... DEFAULT VALUES" analyzed in pgxc planner, * return the sql statement directly if the table has no default values. @@ -3725,32 +3897,9 @@ get_insert_query_def(Query *query, deparse_context *context) appendStringInfo(buf, "%s", query->sql_statement); return; } - - /* - * select_rte and values_rte are not required by INSERT queries in XC - * Both these should stay null for INSERT queries to work corretly - * Consider an example - * create table tt as values(1,'One'),(2,'Two'); - * This query uses values_rte, but we do not need them in XC - * because it gets broken down into two queries - * CREATE TABLE tt(column1 int4, column2 text) - * and - * INSERT INTO tt (column1, column2) VALUES ($1, $2) - * Note that the insert query does not need values_rte - * - * Now consider another example - * insert into tt select * from tt - * This query uses select_rte, but again that is not required in XC - * Again here the query gets broken down into two queries - * SELECT column1, column2 FROM ONLY tt WHERE true - * and - * INSERT INTO tt (column1, column2) VALUES ($1, $2) - * Note again that the insert query does not need select_rte - * Hence we keep both select_rte and values_rte NULL. - */ - if (!(IS_PGXC_COORDINATOR && !IsConnFromCoord())) - { #endif +#endif + /* * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be * a single RTE for the SELECT or VALUES. 
@@ -3773,11 +3922,23 @@ get_insert_query_def(Query *query, deparse_context *context) values_rte = rte; } } + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + #ifdef PGXC +#ifndef XCP + /* + * If it's an INSERT ... SELECT or VALUES (...), (...), ... + * sql_statement is rewritten and assigned in RewriteQuery. + * Just return it here. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && values_rte != NULL) + { + appendStringInfo(buf, "%s", query->sql_statement); + return; } #endif - if (select_rte && values_rte) - elog(ERROR, "both subquery and values RTEs in INSERT"); +#endif /* * Start the query with INSERT INTO relname */ @@ -3848,11 +4009,7 @@ get_insert_query_def(Query *query, deparse_context *context) { /* Add the SELECT */ get_query_def(select_rte->subquery, buf, NIL, NULL, - context->prettyFlags, context->indentLevel -#ifdef PGXC - , context->finalise_aggs, context->sortgroup_colno -#endif /* PGXC */ - ); + context->prettyFlags, context->indentLevel); } else if (values_rte) { @@ -4073,6 +4230,12 @@ get_utility_query_def(Query *query, deparse_context *context) { ColumnDef *coldef = (ColumnDef *) node; TypeName *typename = coldef->typeName; +#ifdef XCP + appendStringInfo(buf, "%s %s", + quote_identifier(coldef->colname), + format_type_with_typemod(typename->typeOid, + typename->typemod)); +#else Type type; /* error out if we have no recourse at all */ @@ -4092,6 +4255,7 @@ get_utility_query_def(Query *query, deparse_context *context) appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname), typeTypeName(type)); ReleaseSysCache(type); +#endif } else elog(ERROR, "Invalid table column definition."); @@ -6563,6 +6727,7 @@ get_agg_expr(Aggref *aggref, deparse_context *context) } #ifdef PGXC +#ifndef XCP /* * Datanode should send finalised aggregate results. Datanodes evaluate only * transition results. 
In order to get the finalised aggregate, we enclose @@ -6589,6 +6754,7 @@ get_agg_expr(Aggref *aggref, deparse_context *context) } ReleaseSysCache(aggTuple); } +#endif /* XCP */ #endif /* PGXC */ appendStringInfo(buf, "%s(%s", @@ -7018,11 +7184,7 @@ get_sublink_expr(SubLink *sublink, deparse_context *context) appendStringInfoChar(buf, '('); get_query_def(query, buf, context->namespaces, NULL, - context->prettyFlags, context->indentLevel -#ifdef PGXC - , context->finalise_aggs, context->sortgroup_colno -#endif /* PGXC */ - ); + context->prettyFlags, context->indentLevel); if (need_paren) appendStringInfo(buf, "))"); @@ -7144,11 +7306,7 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Subquery RTE */ appendStringInfoChar(buf, '('); get_query_def(rte->subquery, buf, context->namespaces, NULL, - context->prettyFlags, context->indentLevel, -#ifdef PGXC - context->finalise_aggs, context->sortgroup_colno -#endif /* PGXC */ - ); + context->prettyFlags, context->indentLevel); appendStringInfoChar(buf, ')'); break; case RTE_FUNCTION: diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index df0bab74f5..5dbd742b6f 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -3,6 +3,11 @@ * version.c * Returns the PostgreSQL version string * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Copyright (c) 1998-2012, PostgreSQL Global Development Group * * IDENTIFICATION @@ -24,9 +29,11 @@ pgsql_version(PG_FUNCTION_ARGS) } #ifdef PGXC +#ifndef XCP Datum pgxc_version(PG_FUNCTION_ARGS) { PG_RETURN_TEXT_P(cstring_to_text(PGXC_VERSION_STR)); } #endif +#endif diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 9ccfc4f114..1c189fab6a 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -85,6 +85,11 @@ * problems can be overcome cheaply. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -98,6 +103,9 @@ #include "access/xact.h" #include "catalog/catalog.h" #include "miscadmin.h" +#ifdef XCP +#include "pgxc/pgxc.h" +#endif #include "storage/sinval.h" #include "storage/smgr.h" #include "utils/inval.h" @@ -831,7 +839,18 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, void AtEOXact_Inval(bool isCommit) { +#ifdef XCP + /* + * In our code, the distributed session may run on multiple backends, + * and we need to broadcast invalidation messages so they reach other + * backends even * in case of rollback. If the session runs on single + * backend the invalidation messages may be still applied locally. + * So the criteria may be more complex. 
+ */ + if (isCommit || IS_PGXC_DATANODE) +#else if (isCommit) +#endif { /* Must be at top of stack */ Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL); diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index db996829b0..8971b05b3a 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -3,6 +3,11 @@ * lsyscache.c * Convenience routines for common queries in the system catalog cache. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -1005,6 +1010,78 @@ get_collation_name(Oid colloid) return NULL; } + +#ifdef XCP +/* + * get_collation_namespace + * Returns the namespace id of a given pg_collation entry. + * + * Returns an Oid of the collation's namespace. + */ +Oid +get_collation_namespace(Oid colloid) +{ + HeapTuple tp; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(colloid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_collation colltup = (Form_pg_collation) GETSTRUCT(tp); + Oid result; + + result = colltup->collnamespace; + ReleaseSysCache(tp); + return result; + } + else + return InvalidOid; +} + + +/* + * get_collation_encoding + * Returns the encoding of a given pg_collation entry. + * + * Returns the collation's encoding, or -1 if entry does not exist. 
+ */ +int32 +get_collation_encoding(Oid colloid) +{ + HeapTuple tp; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(colloid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_collation colltup = (Form_pg_collation) GETSTRUCT(tp); + int32 result; + + result = colltup->collencoding; + ReleaseSysCache(tp); + return result; + } + else + return -1; +} + + +/* + * get_collid + * Given a collation name, encoding and namespace OID, look up + * the collation OID. + * + * Returns InvalidOid if there is no such collation + */ +Oid +get_collid(const char *collname, int32 collencoding, Oid collnsp) +{ + return GetSysCacheOid(COLLNAMEENCNSP, + CStringGetDatum(collname), + Int32GetDatum(collencoding), + ObjectIdGetDatum(collnsp), + 0); +} +#endif + /* ---------- CONSTRAINT CACHE ---------- */ /* @@ -3172,6 +3249,159 @@ get_namespace_name(Oid nspid) return NULL; } + +#ifdef XCP +/* + * Routines to get info to encode/decode oids when sending between nodes + */ + +/* + * get_namespaceid + * Given a namespace name, look up the namespace OID. + * + * Returns InvalidOid if there is no such namespace + */ +Oid +get_namespaceid(const char *nspname) +{ + return GetSysCacheOid(NAMESPACENAME, + CStringGetDatum(nspname), + 0, 0, 0); +} + +/* + * get_typ_name + * + * Given the type OID, find the type name + * It returns palloc'd copy of the name or NULL if the cache lookup fails... + */ +char * +get_typ_name(Oid typid) +{ + HeapTuple tp; + + tp = SearchSysCache(TYPEOID, + ObjectIdGetDatum(typid), + 0, 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp); + char *result; + + result = pstrdup(NameStr(typtup->typname)); + ReleaseSysCache(tp); + return result; + } + else + return NULL; +} + +/* + * get_typ_namespace + * + * Given the type OID, find the namespace + * It returns InvalidOid if the cache lookup fails... 
+ */ +Oid +get_typ_namespace(Oid typid) +{ + HeapTuple tp; + + tp = SearchSysCache(TYPEOID, + ObjectIdGetDatum(typid), + 0, 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp); + Oid result; + + result = typtup->typnamespace; + ReleaseSysCache(tp); + return result; + } + else + return InvalidOid; +} + +/* + * get_typname_typid + * Given a type name and namespace OID, look up the type OID. + * + * Returns InvalidOid if there is no such type + */ +Oid +get_typname_typid(const char *typname, Oid typnamespace) +{ + return GetSysCacheOid(TYPENAMENSP, + CStringGetDatum(typname), + ObjectIdGetDatum(typnamespace), + 0, 0); +} + +/* + * get_funcid + * Given a function name, argument types and namespace OID, look up + * the function OID. + * + * Returns InvalidOid if there is no such function + */ +Oid +get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp) +{ + return GetSysCacheOid(PROCNAMEARGSNSP, + CStringGetDatum(funcname), + PointerGetDatum(argtypes), + ObjectIdGetDatum(funcnsp), + 0); +} + +/* + * get_opnamespace + * Given an opno, find the namespace + * + * Returns InvalidOid if there is no such operator + */ +Oid +get_opnamespace(Oid opno) +{ + HeapTuple tp; + + tp = SearchSysCache(OPEROID, + ObjectIdGetDatum(opno), + 0, 0, 0); + if (HeapTupleIsValid(tp)) + { + Form_pg_operator optup = (Form_pg_operator) GETSTRUCT(tp); + Oid result; + + result = optup->oprnamespace; + ReleaseSysCache(tp); + return result; + } + else + return InvalidOid; +} + +/* + * get_operid + * Given an operator name, argument types and namespace OID, look up + * the operator OID. 
+ * + * Returns InvalidOid if there is no such operator + */ +Oid +get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp) +{ + return GetSysCacheOid(OPERNAMENSP, + CStringGetDatum(oprname), + ObjectIdGetDatum(oprleft), + ObjectIdGetDatum(oprright), + ObjectIdGetDatum(oprnsp)); +} + +#endif + + /* ---------- PG_RANGE CACHE ---------- */ /* diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index ab8d8a491e..c34d514997 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -35,6 +35,11 @@ * be infrequent enough that more-detailed tracking is not worth the effort. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -66,6 +71,9 @@ #ifdef PGXC #include "commands/prepare.h" #include "pgxc/execRemote.h" +#ifdef XCP +#include "pgxc/squeue.h" +#endif #include "pgxc/pgxc.h" @@ -319,8 +327,10 @@ CompleteCachedPlan(CachedPlanSource *plansource, plansource->cursor_options = cursor_options; plansource->fixed_result = fixed_result; #ifdef PGXC +#ifndef XCP plansource->stmt_name = NULL; #endif +#endif plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); MemoryContextSwitchTo(oldcxt); @@ -438,6 +448,7 @@ ReleaseGenericPlan(CachedPlanSource *plansource) CachedPlan *plan = plansource->gplan; #ifdef PGXC +#ifndef XCP /* Drop this plan on remote nodes */ if (plan) { @@ -456,6 +467,19 @@ ReleaseGenericPlan(CachedPlanSource *plansource) } } #endif +#endif + +#ifdef XCP + /* Release SharedQueue if still held */ + if (IsConnFromDatanode() && plan && list_length(plan->stmt_list) == 1) + { + PlannedStmt 
*pstmt; + + pstmt = (PlannedStmt *) linitial(plan->stmt_list); + if (IsA(pstmt, PlannedStmt) && pstmt->pname) + SharedQueueRelease(pstmt->pname); + } +#endif Assert(plan->magic == CACHEDPLAN_MAGIC); plansource->gplan = NULL; @@ -535,6 +559,9 @@ RevalidateCachedQuery(CachedPlanSource *plansource) MemoryContextDelete(qcxt); } + /* Drop the generic plan reference if any */ + ReleaseGenericPlan(plansource); + /* * Now re-do parse analysis and rewrite. This not incidentally acquires * the locks we need to do planning safely. @@ -1141,9 +1168,7 @@ ReleaseCachedPlan(CachedPlan *plan, bool useResOwner) Assert(plan->refcount > 0); plan->refcount--; if (plan->refcount == 0) - { MemoryContextDelete(plan->context); - } } /* @@ -1536,6 +1561,9 @@ PlanCacheComputeResultDesc(List *stmt_list) switch (ChoosePortalStrategy(stmt_list)) { +#ifdef XCP + case PORTAL_DISTRIBUTED: +#endif case PORTAL_ONE_SELECT: case PORTAL_ONE_MOD_WITH: query = (Query *) linitial(stmt_list); @@ -1758,3 +1786,99 @@ ResetPlanCache(void) } } } + + +#ifdef XCP +void +SetRemoteSubplan(CachedPlanSource *plansource, const char *plan_string) +{ + CachedPlan *plan; + MemoryContext plan_context; + MemoryContext oldcxt; + RemoteStmt *rstmt; + PlannedStmt *stmt; + + Assert(IS_PGXC_DATANODE); + Assert(plansource->raw_parse_tree == NULL); + Assert(plansource->query_list == NIL); + + /* + * Make dedicated query context to store cached plan. It is in current + * memory context for now, later it will be reparented to + * CachedMemoryContext. If it is in CachedMemoryContext initially we would + * have to destroy it in case of error. + */ + plan_context = AllocSetContextCreate(CurrentMemoryContext, + "CachedPlan", + ALLOCSET_SMALL_MINSIZE, + ALLOCSET_SMALL_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcxt = MemoryContextSwitchTo(plan_context); + + /* + * Restore query plan. 
+ */ + set_portable_input(true); + rstmt = (RemoteStmt *) stringToNode((char *) plan_string); + set_portable_input(false); + + stmt = makeNode(PlannedStmt); + + stmt->commandType = rstmt->commandType; + stmt->hasReturning = rstmt->hasReturning; + stmt->canSetTag = true; + stmt->transientPlan = false; // ??? + stmt->planTree = rstmt->planTree; + stmt->rtable = rstmt->rtable; + stmt->resultRelations = rstmt->resultRelations; + stmt->utilityStmt = NULL; + stmt->subplans = rstmt->subplans; + stmt->rewindPlanIDs = NULL; + stmt->rowMarks = rstmt->rowMarks; + stmt->relationOids = NIL; + stmt->invalItems = NIL; + stmt->nParamExec = rstmt->nParamExec; + stmt->nParamRemote = rstmt->nParamRemote; + stmt->remoteparams = rstmt->remoteparams; + stmt->pname = plansource->stmt_name; + stmt->distributionType = rstmt->distributionType; + stmt->distributionKey = rstmt->distributionKey; + stmt->distributionNodes = rstmt->distributionNodes; + stmt->distributionRestrict = rstmt->distributionRestrict; + + /* + * Set up SharedQueue if intermediate results need to be distributed + * on multiple destination Datanodes. + */ + if (IsConnFromDatanode() && stmt->pname && + list_length(stmt->distributionRestrict) > 1) + SharedQueueAcquire(stmt->pname, + list_length(stmt->distributionRestrict) - 1); + + /* + * Create and fill the CachedPlan struct within the new context. 
+ */ + plan = (CachedPlan *) palloc(sizeof(CachedPlan)); + plan->magic = CACHEDPLAN_MAGIC; + plan->stmt_list = list_make1(stmt); + plan->saved_xmin = InvalidTransactionId; + plan->refcount = 1; /* will be referenced by plansource */ + plan->context = plan_context; + if (plansource->is_saved) + { + MemoryContextSetParent(plan_context, CacheMemoryContext); + plan->is_saved = true; + } + else + { + MemoryContextSetParent(plan_context, + MemoryContextGetParent(plansource->context)); + plan->is_saved = false; + } + plan->is_valid = true; + + plansource->gplan = plan; + + MemoryContextSwitchTo(oldcxt); +} +#endif diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 4df7547e1c..dcdae41fdf 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3,6 +3,11 @@ * relcache.c * POSTGRES relation descriptor cache code * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -860,7 +865,15 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) break; case RELPERSISTENCE_TEMP: if (isTempOrToastNamespace(relation->rd_rel->relnamespace)) + { +#ifdef XCP + relation->rd_backend = OidIsValid(MyCoordId) ? 
+ MyFirstBackendId : MyBackendId; +#else + relation->rd_backend = MyBackendId; +#endif + } else { /* @@ -901,9 +914,14 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) relation->trigdesc = NULL; #ifdef PGXC +#ifdef XCP + if (IS_PGXC_COORDINATOR && + relation->rd_id >= FirstNormalObjectId) +#else if (IS_PGXC_COORDINATOR && relation->rd_id >= FirstNormalObjectId && !IsAutoVacuumWorkerProcess()) +#endif RelationBuildLocator(relation); #endif /* @@ -2542,6 +2560,11 @@ RelationBuildLocalRelation(const char *relname, rel->rd_backend = InvalidBackendId; break; case RELPERSISTENCE_TEMP: +#ifdef XCP + if (OidIsValid(MyCoordId)) + rel->rd_backend = MyFirstBackendId; + else +#endif rel->rd_backend = MyBackendId; break; default: @@ -2905,6 +2928,7 @@ RelationCacheInitializePhase3(void) TriggerRelationId); #define NUM_CRITICAL_LOCAL_INDEXES 7 /* fix if you change list above */ + criticalRelcachesBuilt = true; } diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 3e04164956..ca8a543314 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -461,3 +461,4 @@ Section: Class XX - Internal Error XX000 E ERRCODE_INTERNAL_ERROR internal_error XX001 E ERRCODE_DATA_CORRUPTED data_corrupted XX002 E ERRCODE_INDEX_CORRUPTED index_corrupted +XX010 E ERRCODE_PRODUCER_ERROR diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index fb46ab7218..3a78d7ab17 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -3,6 +3,11 @@ * globals.c * global variable declarations * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -59,6 +64,14 @@ char postgres_exec_path[MAXPGPATH]; /* full path to backend */ /* note: currently this is not valid in backend processes */ #endif +#ifdef XCP +Oid MyCoordId = InvalidOid; + +int MyCoordPid = 0; + +BackendId MyFirstBackendId = InvalidBackendId; +#endif + BackendId MyBackendId = InvalidBackendId; Oid MyDatabaseId = InvalidOid; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 6b2833e1cb..ce4bb71366 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -3,6 +3,11 @@ * miscinit.c * miscellaneous initialization support stuff * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -29,9 +34,15 @@ #include <utime.h> #endif +#ifdef XCP +#include "catalog/namespace.h" +#endif #include "catalog/pg_authid.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#ifdef XCP +#include "pgxc/execRemote.h" +#endif #include "postmaster/autovacuum.h" #include "postmaster/postmaster.h" #include "storage/fd.h" @@ -543,6 +554,117 @@ SetSessionAuthorization(Oid userid, bool is_superuser) PGC_INTERNAL, PGC_S_OVERRIDE); } + +#ifdef XCP +void +SetGlobalSession(Oid coordid, int coordpid) +{ + bool reset = false; + BackendId firstBackend = InvalidBackendId; + int bCount = 0; + int bPids[MaxBackends]; + + /* If nothing changed do nothing */ + if (MyCoordId == coordid && MyCoordPid == coordpid) + return; + + /* + * Need to reset pool manager agent if the backend being assigned to + * different global session or assignment is canceled. + */ + if (OidIsValid(MyCoordId) && + (MyCoordId != coordid || MyCoordPid != coordpid)) + reset = true; + +retry: + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + /* Expose distributed session id in the PGPROC structure */ + MyProc->coordId = coordid; + MyProc->coordPid = coordpid; + /* + * Determine first backend id. + * If this backend is the first backend of the distributed session on the + * node we should clean up the temporary namespace. + * Backend is the first if no backends with such distributed session id. + * If such backends are found we can copy first found valid firstBackendId. + * If none of them valid that means the first is still cleaning up the + * temporary namespace. 
+ */ + if (OidIsValid(coordid)) + firstBackend = GetFirstBackendId(&bCount, bPids); + else + firstBackend = InvalidBackendId; + /* If first backend id is defined set it right now */ + if (firstBackend != InvalidBackendId) + MyProc->firstBackendId = firstBackend; + LWLockRelease(ProcArrayLock); + + if (OidIsValid(coordid) && firstBackend == InvalidBackendId) + { + /* + * We are the first or need to retry + */ + if (bCount > 0) + { + /* XXX sleep ? */ + goto retry; + } + else + { + /* Set globals for this backend */ + MyCoordId = coordid; + MyCoordPid = coordpid; + MyFirstBackendId = MyBackendId; + /* XXX Maybe this lock is not needed because of atomic operation? */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->firstBackendId = MyBackendId; + LWLockRelease(ProcArrayLock); + } + } + else + { + /* Set globals for this backend */ + MyCoordId = coordid; + MyCoordPid = coordpid; + MyFirstBackendId = firstBackend; + } + + if (reset) + { + /* + * Next time when backend will be assigned to a global session it will + * be referencing different temp namespace + */ + ForgetTempTableNamespace(); + /* + * Forget all local and session parameters cached for the Datanodes. + * They do not belong to that session. + */ + PGXCNodeResetParams(false); + /* + * Release node connections, if still held. + */ + release_handles(); + /* + * XXX Do other stuff like release secondary Datanode connections, + * clean up shared queues ??? + */ + } +} + + +/* + * Returns the name of the role that should be used to access other cluster + * nodes. 
+ */ +char * +GetClusterUserName(void) +{ + return GetUserNameFromId(AuthenticatedUserId); +} +#endif + + /* * Report current role id * This follows the semantics of SET ROLE, ie return the outer-level ID diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 93da70681b..282a74666a 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -3,6 +3,11 @@ * postinit.c * postgres initialization utilities * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -34,6 +39,9 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" +#ifdef XCP +#include "pgxc/pgxc.h" +#endif #include "postmaster/autovacuum.h" #include "postmaster/postmaster.h" #include "replication/walsender.h" @@ -305,6 +313,9 @@ CheckMyDatabase(const char *name, bool am_superuser) * just document that the connection limit is approximate. */ if (dbform->datconnlimit >= 0 && +#ifdef XCP + IS_PGXC_COORDINATOR && +#endif !am_superuser && CountDBBackends(MyDatabaseId) > dbform->datconnlimit) ereport(FATAL, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e5d95457d7..cdd82bcdc4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -6,6 +6,11 @@ * See src/backend/utils/misc/README for more information. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Copyright (c) 2000-2012, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * Written by Peter Eisentraut <[email protected]>. @@ -59,14 +64,20 @@ #ifdef PGXC #include "commands/tablecmds.h" #include "nodes/nodes.h" -#include "optimizer/pgxcship.h" #include "pgxc/execRemote.h" #include "pgxc/locator.h" -#include "optimizer/pgxcplan.h" +#include "pgxc/planner.h" #include "pgxc/poolmgr.h" #include "pgxc/nodemgr.h" #include "pgxc/xc_maintenance_mode.h" #endif +#ifdef XCP +#include "commands/sequence.h" +#include "pgxc/nodemgr.h" +#include "pgxc/squeue.h" +#include "utils/snapmgr.h" +#include "parser/parse_utilcmd.h" +#endif #include "postmaster/autovacuum.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" @@ -201,8 +212,10 @@ static bool check_ssl(bool *newval, void **extra, GucSource source); static bool check_stage_log_stats(bool *newval, void **extra, GucSource source); static bool check_log_stats(bool *newval, void **extra, GucSource source); #ifdef PGXC +#ifndef XCP static bool check_pgxc_maintenance_mode(bool *newval, void **extra, GucSource source); #endif +#endif static bool check_canonical_path(char **newval, void **extra, GucSource source); static bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); static void assign_timezone_abbreviations(const char *newval, void *extra); @@ -229,6 +242,10 @@ static const char *show_log_file_mode(void); static char *config_enum_get_options(struct config_enum * record, const char *prefix, const char *suffix, const char *separator); +#ifdef XCP +static bool check_storm_catalog_remap_string(char **newval, + void **extra, GucSource source); +#endif /* @@ -479,6 +496,10 @@ int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +#ifdef XCP +char *storm_catalog_remap_string; +#endif + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. 
The real state is elsewhere @@ -502,6 +523,9 @@ static char *log_timezone_string; static char *timezone_abbreviations_string; static char *XactIsoLevel_string; static char *session_authorization_string; +#ifdef XCP +static char *global_session_string; +#endif static int max_function_args; static int max_index_keys; static int max_identifier_length; @@ -808,6 +832,7 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, #ifdef PGXC +#ifndef XCP { {"enable_remotejoin", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of remote join plans."), @@ -835,25 +860,21 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, +#else { - {"enable_remotesort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of remote sort plans."), - NULL - }, - &enable_remotesort, - true, - NULL, NULL, NULL - }, - { - {"enable_remotelimit", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of remote limit plans."), - NULL + {"loose_constraints", PGC_USERSET, COORDINATORS, + gettext_noop("Relax enforcing of constraints"), + gettext_noop("If enabled then constraints like foreign keys " + "are not enforced. 
It's the users responsibility " + "to maintain referential integrity at the application " + "level") }, - &enable_remotelimit, - true, + &loose_constraints, + false, NULL, NULL, NULL }, #endif +#endif { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), @@ -1454,7 +1475,11 @@ static struct config_bool ConfigureNamesBool[] = }, { +#ifdef XCP + {"synchronize_seqscans", PGC_SUSET, COMPAT_OPTIONS_PREVIOUS, +#else {"synchronize_seqscans", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, +#endif gettext_noop("Enable synchronized sequential scans."), NULL }, @@ -1516,6 +1541,7 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, #ifdef PGXC +#ifndef XCP { {"persistent_datanode_connections", PGC_BACKEND, DEVELOPER_OPTIONS, gettext_noop("Session never releases acquired connections."), @@ -1527,6 +1553,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"strict_statement_checking", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Forbid statements that are not safe for the cluster"), + NULL + }, + &StrictStatementChecking, + true, + NULL, NULL, NULL + }, + { {"enforce_two_phase_commit", PGC_SUSET, XC_HOUSEKEEPING_OPTIONS, gettext_noop("Enforce the use of two-phase commit on transactions that" "made use of temporary objects"), @@ -1546,6 +1581,7 @@ static struct config_bool ConfigureNamesBool[] = check_pgxc_maintenance_mode, NULL, NULL }, #endif +#endif { {"lo_compat_privileges", PGC_SUSET, COMPAT_OPTIONS_PREVIOUS, @@ -1750,7 +1786,11 @@ static struct config_int ConfigureNamesInt[] = }, { +#ifdef XCP + {"temp_buffers", PGC_SUSET, RESOURCES_MEM, +#else {"temp_buffers", PGC_USERSET, RESOURCES_MEM, +#endif gettext_noop("Sets the maximum number of temporary buffers used by each session."), NULL, GUC_UNIT_BLOCKS @@ -1800,7 +1840,11 @@ static struct config_int ConfigureNamesInt[] = }, { +#ifdef XCP + {"work_mem", PGC_SUSET, RESOURCES_MEM, +#else {"work_mem", PGC_USERSET, RESOURCES_MEM, +#endif 
gettext_noop("Sets the maximum memory to be used for query workspaces."), gettext_noop("This much memory can be used by each internal " "sort operation and hash table before switching to " @@ -1813,7 +1857,11 @@ static struct config_int ConfigureNamesInt[] = }, { +#ifdef XCP + {"maintenance_work_mem", PGC_SUSET, RESOURCES_MEM, +#else {"maintenance_work_mem", PGC_USERSET, RESOURCES_MEM, +#endif gettext_noop("Sets the maximum memory to be used for maintenance operations."), gettext_noop("This includes operations such as VACUUM and CREATE INDEX."), GUC_UNIT_KB @@ -2154,7 +2202,11 @@ static struct config_int ConfigureNamesInt[] = }, { +#ifdef XCP + {"commit_delay", PGC_SUSET, WAL_SETTINGS, +#else {"commit_delay", PGC_USERSET, WAL_SETTINGS, +#endif gettext_noop("Sets the delay in microseconds between transaction commit and " "flushing WAL to disk."), NULL @@ -2165,7 +2217,11 @@ static struct config_int ConfigureNamesInt[] = }, { +#ifdef XCP + {"commit_siblings", PGC_SUSET, WAL_SETTINGS, +#else {"commit_siblings", PGC_USERSET, WAL_SETTINGS, +#endif gettext_noop("Sets the minimum concurrent open transactions before performing " "commit_delay."), NULL @@ -2505,6 +2561,51 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, #ifdef PGXC +#ifdef XCP + { + {"sequence_range", PGC_USERSET, COORDINATORS | DATA_NODES, + gettext_noop("The range of values to ask from GTM for sequences. 
" + "If CACHE parameter is set then that overrides this."), + NULL, + }, + &SequenceRangeVal, + 1, 1, INT_MAX, + NULL, NULL, NULL + }, + + { + {"pool_conn_keepalive", PGC_SIGHUP, DATA_NODES, + gettext_noop("Close connections if they are idle in the pool for that time."), + gettext_noop("A value of -1 turns autoclose off."), + GUC_UNIT_S + }, + &PoolConnKeepAlive, + 600, -1, INT_MAX, + NULL, NULL, NULL + }, + + { + {"pool_maintenance_timeout", PGC_SIGHUP, DATA_NODES, + gettext_noop("Launch maintenance routine if pooler idle for that time."), + gettext_noop("A value of -1 turns feature off."), + GUC_UNIT_S + }, + &PoolMaintenanceTimeout, + 30, -1, INT_MAX, + NULL, NULL, NULL + }, + + { + {"max_pool_size", PGC_SIGHUP, DATA_NODES, + gettext_noop("Max pool size."), + gettext_noop("If number of active connections reaches this value, " + "other connection requests will be refused") + }, + &MaxPoolSize, + 100, 1, 65535, + NULL, NULL, NULL + }, +#else { {"min_pool_size", PGC_POSTMASTER, DATA_NODES, gettext_noop("Initial pool size."), @@ -2526,6 +2627,7 @@ static struct config_int ConfigureNamesInt[] = 100, 1, 65535, NULL, NULL, NULL }, +#endif { {"pooler_port", PGC_POSTMASTER, DATA_NODES, @@ -2568,7 +2670,37 @@ static struct config_int ConfigureNamesInt[] = 16, 2, 65535, NULL, NULL, NULL }, + +#ifdef XCP + /* + * Shared queues provide shared memory buffers to stream data from + * "producer" - process which executes subplan to "consumers" - processes + * that are forwarding data to destination data nodes. 
+ */ + { + {"shared_queues", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the number of shared memory queues used by the distributed executor."), + NULL, + GUC_UNIT_BLOCKS + }, + &NSQueues, + 64, 16, INT_MAX, + NULL, NULL, NULL + }, + + { + {"shared_queue_size", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the amount of memory allocated for a shared memory queue."), + NULL, + GUC_UNIT_BLOCKS + }, + &SQueueSize, + 64, 16, MAX_KILOBYTES, + NULL, NULL, NULL + }, #endif +#endif /* PGXC */ + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL @@ -2640,6 +2772,28 @@ static struct config_real ConfigureNamesReal[] = NULL, NULL, NULL }, +#ifdef XCP + { + {"network_byte_cost", PGC_USERSET, QUERY_TUNING_COST, + gettext_noop("Sets the planner's estimate of the cost of " + "sending data from remote node."), + NULL + }, + &network_byte_cost, + DEFAULT_NETWORK_BYTE_COST, 0, DBL_MAX, NULL, NULL + }, + + { + {"remote_query_cost", PGC_USERSET, QUERY_TUNING_COST, + gettext_noop("Sets the planner's estimate of the cost of " + "setting up remote subquery."), + NULL + }, + &remote_query_cost, + DEFAULT_REMOTE_QUERY_COST, 0, DBL_MAX, NULL, NULL + }, +#endif + { {"geqo_selection_bias", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("GEQO: selective pressure within the population."), @@ -2983,6 +3137,31 @@ static struct config_string ConfigureNamesString[] = check_session_authorization, assign_session_authorization, NULL }, +#ifdef XCP + { + {"global_session", PGC_USERSET, UNGROUPED, + gettext_noop("Sets the global session identifier."), + NULL, + GUC_IS_NAME | GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST + }, + &global_session_string, + "none", + check_global_session, assign_global_session, NULL + }, + + { + {"pgxc_catalog_remap", PGC_SIGHUP, XC_HOUSEKEEPING_OPTIONS, + gettext_noop("List of catalog tables/views that always need to be " + "mapped to the storm_catalog."), + NULL, + 
GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY + }, + &storm_catalog_remap_string, + "pg_roles, pg_shdescription, pg_database, pg_db_role_setting, pg_tablespace, pg_auth_members, pg_shdepend, pg_stat_database, pg_stat_database_conflicts, pg_stat_activity, pg_locks, pg_prepared_xacts, pg_settings, pg_user, pg_group, pg_shadow, pg_user_mappings, pg_database_size, pg_show_all_settings, pg_stat_get_activity, pg_lock_status", + check_storm_catalog_remap_string, NULL, NULL + }, +#endif + { {"log_destination", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Sets the destination for server log output."), @@ -3252,6 +3431,17 @@ static struct config_string ConfigureNamesString[] = NULL, NULL, NULL }, #endif +#ifdef XCP + { + {"parentnode", PGC_BACKEND, CONN_AUTH, + gettext_noop("Sets the name of the parent data node"), + NULL + }, + &parentPGXCNode, + NULL, + NULL, NULL, NULL + }, +#endif /* XCP */ { {"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY, gettext_noop("Sets the list of allowed SSL ciphers."), @@ -3497,6 +3687,9 @@ static struct config_enum ConfigureNamesEnum[] = #ifdef PGXC { {"remotetype", PGC_BACKEND, CONN_AUTH, +#ifdef XCP + gettext_noop("Sets the type of Postgres-XL remote connection"), +#endif gettext_noop("Sets the type of Postgres-XC remote connection"), NULL }, @@ -5347,6 +5540,13 @@ set_config_option(const char *name, const char *value, struct config_generic *record; bool prohibitValueChange = false; bool makeDefault; +#ifdef XCP + bool send_to_nodes = false; + + /* Determine now, because source may be changed below in the function */ + if (source == PGC_S_SESSION && (IS_PGXC_DATANODE || !IsConnFromCoord())) + send_to_nodes = true; +#endif #ifdef PGXC /* @@ -6083,6 +6283,75 @@ set_config_option(const char *name, const char *value, if (changeVal && (record->flags & GUC_REPORT)) ReportGUCOption(record); +#ifdef XCP + if (send_to_nodes) + { + RemoteQuery *step; + StringInfoData poolcmd; + + initStringInfo(&poolcmd); + + /* + * We are getting parse error 
when sending down + * SET transaction_isolation TO read committed; + * XXX generic solution? + */ + if (value && strcmp("transaction_isolation", name) == 0) + value = quote_identifier(value); + + /* + * Quote value if it is including memory or time units + */ + if (value && (record->flags & (GUC_UNIT_MEMORY | GUC_UNIT_TIME))) + value = quote_identifier(value); + + /* + * Save new parameter value with the node manager. + * XXX here we may check: if value equals to configuration default + * just reset parameter instead. Minus one table entry, shorter SET + * command sent down... Sounds like optimization. + */ + if (action == GUC_ACTION_LOCAL) + { + if (IsTransactionBlock()) + PGXCNodeSetParam(true, name, value); + appendStringInfo(&poolcmd, "SET LOCAL %s TO %s", name, + (value ? value : "DEFAULT")); + } + else + { + PGXCNodeSetParam(false, name, value); + appendStringInfo(&poolcmd, "SET %s TO %s", name, + (value ? value : "DEFAULT")); + } + + /* + * Send new value down to remote nodes if any is connected + * XXX here we are creating a node and invoke a function that is trying + * to send some. That introduces some overhead, which may seem to be + * significant if application sets a bunch of parameters before doing + * anything useful - waste work for each set statement. + * We may want to avoid that, by resetting the remote parameters and + * flagging that parameters need to be updated before sending down next + * statement. + * On the other hand if session runs with a number of customized + * parameters and switching one, that would cause all values to be resent. + * So let's go with "send immediately" approach: parameters are not set + * too often to care about overhead here. 
+ */ + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->exec_nodes = NULL; + step->sql_statement = poolcmd.data; + /* force_autocommit is actually does not start transaction on nodes */ + step->force_autocommit = true; + step->exec_type = EXEC_ON_CURRENT; + ExecRemoteUtility(step); + pfree(step); + pfree(poolcmd.data); + } +#endif + return changeVal ? 1 : -1; } @@ -6407,6 +6676,11 @@ ExecSetVariableStmt(VariableSetStmt *stmt) { ListCell *head; +#ifdef XCP + /* SET TRANSACTION assumes "local" */ + stmt->is_local = true; +#endif + foreach(head, stmt->args) { DefElem *item = (DefElem *) lfirst(head); @@ -6429,6 +6703,11 @@ ExecSetVariableStmt(VariableSetStmt *stmt) { ListCell *head; +#ifdef XCP + /* SET SESSION CHARACTERISTICS assumes "session" */ + stmt->is_local = false; +#endif + foreach(head, stmt->args) { DefElem *item = (DefElem *) lfirst(head); @@ -6568,6 +6847,7 @@ set_config_by_name(PG_FUNCTION_ARGS) #ifdef PGXC +#ifndef XCP /* * Convert this to SET statement and pass it to pooler. * If command is local and we are not in a transaction block do NOT @@ -6590,6 +6870,7 @@ set_config_by_name(PG_FUNCTION_ARGS) } #endif +#endif /* Convert return string to text */ PG_RETURN_TEXT_P(cstring_to_text(new_value)); @@ -8718,6 +8999,7 @@ check_log_stats(bool *newval, void **extra, GucSource source) } #ifdef PGXC +#ifndef XCP /* * Only a warning is printed to log. * Returning false will cause FATAL error and it will not be good. @@ -8763,6 +9045,7 @@ check_pgxc_maintenance_mode(bool *newval, void **extra, GucSource source) } } #endif +#endif static bool check_canonical_path(char **newval, void **extra, GucSource source) @@ -9060,4 +9343,73 @@ show_log_file_mode(void) return buf; } +#ifdef XCP +/* + * remove all unwanted spaces from the input, lowercase all the characters and + * also add a ',' towards the end if it does not exist. 
This makes calling + * strstr easier on it + */ +static bool +check_storm_catalog_remap_string(char **newval, void **extra, GucSource source) +{ + /* + * Check syntax. newval must be a comma separated list of identifiers. + * Whitespace is allowed but removed from the result. + */ + bool hasSpaceAfterToken = false; + const char *cp = *newval; + int symLen = 0; + char c; + StringInfoData buf; + + /* Default NULL is OK */ + if (cp == NULL) + return true; + + initStringInfo(&buf); + while ((c = *cp++) != '\0') + { + if (isspace((unsigned char) c)) + { + if (symLen > 0) + hasSpaceAfterToken = true; + continue; + } + + if (c == ',') + { + if (symLen > 0) /* terminate identifier */ + { + appendStringInfoChar(&buf, ','); + symLen = 0; + } + hasSpaceAfterToken = false; + continue; + } + + if (hasSpaceAfterToken) + { + /* + * Syntax error due to token following space after token + */ + pfree(buf.data); + return false; + } + /* We lower case everything */ + appendStringInfoChar(&buf, pg_tolower(c)); + symLen++; + } + + /* Append ',' at end if not present already */ + if (symLen != 0 && buf.len > 0) + appendStringInfoChar(&buf, ','); + + /* GUC wants the result malloc'd */ + free(*newval); + *newval = guc_strdup(LOG, buf.data); + + pfree(buf.data); + return true; +} +#endif #include "guc-file.c" diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 4d9121814b..9bb47f967a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -152,6 +152,10 @@ #effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +# - Shared queues - + +#shared_queues = 64 # min 16 +#shared_queue_size = 64KB # min 16KB #------------------------------------------------------------------------------ # WRITE AHEAD LOG @@ -263,6 +267,8 @@ #cpu_tuple_cost = 0.01 # same scale as above #cpu_index_tuple_cost = 0.005 # same scale as above #cpu_operator_cost = 0.0025 # same scale as above 
+#network_byte_cost = 0.001 # same scale as above +#remote_query_cost = 100.0 # same scale as above #effective_cache_size = 128MB # - Genetic Query Optimizer - @@ -570,10 +576,13 @@ #pooler_port = 6667 # Pool Manager TCP port # (change requires restart) -#min_pool_size = 1 # Initial pool size - # (change requires restart) #max_pool_size = 100 # Maximum pool size - # (change requires restart) +#pool_conn_keepalive = 600 # Close connections if they are idle + # in the pool for that time + # A value of -1 turns autoclose off +#pool_maintenance_timeout = 30 # Launch maintenance routine if pooler + # is idle for that time + # A value of -1 turns feature off #persistent_datanode_connections = off # Set persistent connection mode for pooler # if set at on, connections taken for session # are not put back to pool @@ -598,20 +607,14 @@ ##------------------------------------------------------------------------------ # OTHER PG-XC OPTIONS #------------------------------------------------------------------------------ +#strict_statement_checking = on # Forbid PG-XC-unsafe SQL + # Enabling is useful for development #enforce_two_phase_commit = on # Enforce the usage of two-phase commit on transactions # where temporary objects are used or ON COMMIT actions # are pending. # Usage of commit instead of two-phase commit may break # data consistency so use at your own risk. 
-# - Postgres-XC specific Planner Method Configuration - -#enable_fast_query_shipping = on -#enable_remotejoin = on -#enable_remotegroup = on -#enable_remotelimit = on -#enable_remotesort = on - #------------------------------------------------------------------------------ # CUSTOMIZED OPTIONS #------------------------------------------------------------------------------ diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 6a1858d2a5..2d28c4e2e3 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -8,6 +8,11 @@ * doesn't actually run the executor for them. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -395,6 +400,52 @@ PortalCreateHoldStore(Portal portal) MemoryContextSwitchTo(oldcxt); } +#ifdef XCP +void +PortalCreateProducerStore(Portal portal) +{ + MemoryContext oldcxt; + + Assert(portal->holdContext == NULL); + Assert(portal->holdStore == NULL); + + /* + * Create the memory context that is used for storage of the tuple set. + * Note this is NOT a child of the portal's heap memory. + */ + portal->holdContext = + AllocSetContextCreate(PortalMemory, + "PortalHoldContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + /* + * Create the tuple store, selecting cross-transaction temp files, and + * enabling random access only if cursor requires scrolling. + * + * XXX: Should maintenance_work_mem be used for the portal size? 
+ */ + oldcxt = MemoryContextSwitchTo(portal->holdContext); + + portal->tmpContext = AllocSetContextCreate(portal->holdContext, + "TuplestoreTempContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + /* + * We really do not need interXact set to true for the producer store, + * but we have to set it as long as we store it in holdStore variable - + * portal destroys it after the resource owner invalidating internal + * temporary file if tuplestore has been ever spilled to disk + */ + portal->holdStore = tuplestore_begin_datarow(true, work_mem, + portal->tmpContext); + + MemoryContextSwitchTo(oldcxt); +} +#endif + /* * PinPortal * Protect a portal from dropping. @@ -524,6 +575,17 @@ PortalDrop(Portal portal, bool isTopCommit) /* drop cached plan reference, if any */ PortalReleaseCachedPlan(portal); +#ifdef XCP + /* + * Skip memory release if portal is still producing, meaning it has tuples in + * local memory, and has to push them to consumers. It would lose the + * tuples if we freed the memory now. + * The cleanup should be completed if the portal finished producing. + */ + if (portalIsProducing(portal)) + return; +#endif + /* * Release any resources still attached to the portal. There are several * cases being covered here: @@ -843,16 +905,16 @@ AtCleanup_Portals(void) if (portal->portalPinned) portal->portalPinned = false; -#ifdef PGXC +#ifdef PGXC /* XXX This is a PostgreSQL bug (already reported on the list by * Pavan). We comment out the assertion until the bug is fixed * upstream. - */ + */ /* We had better not be calling any user-defined code here */ /* Assert(portal->cleanup == NULL); */ #endif - + /* Zap it. 
*/ PortalDrop(portal, false); } @@ -992,6 +1054,45 @@ AtSubCleanup_Portals(SubTransactionId mySubid) } } + +#ifdef XCP +static List *producingPortals = NIL; + +List * +getProducingPortals(void) +{ + return producingPortals; +} + + +void +addProducingPortal(Portal portal) +{ + MemoryContext save_context; + + save_context = MemoryContextSwitchTo(PortalMemory); + + producingPortals = lappend(producingPortals, portal); + + MemoryContextSwitchTo(save_context); +} + + +void +removeProducingPortal(Portal portal) +{ + producingPortals = list_delete_ptr(producingPortals, portal); +} + + +bool +portalIsProducing(Portal portal) +{ + return list_member_ptr(producingPortals, portal); +} +#endif + + /* Find all available cursors */ Datum pg_cursor(PG_FUNCTION_ARGS) diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 765ac4cef7..f43ffb8a97 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -87,6 +87,11 @@ * above. Nonetheless, with large workMem we can have many tapes. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -224,8 +229,12 @@ struct Tuplesortstate MemoryContext sortcontext; /* memory context holding all sort data */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ #ifdef PGXC +#ifdef XCP + ResponseCombiner *combiner; /* tuple source, alternate to tapeset */ +#else RemoteQueryState *combiner; /* tuple source, alternate to tapeset */ -#endif +#endif /* XCP */ +#endif /* PGXC */ /* * These function pointers decouple the routines that must know what kind @@ -903,7 +912,11 @@ Tuplesortstate * tuplesort_begin_merge(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, +#ifdef XCP + ResponseCombiner *combiner, +#else RemoteQueryState *combiner, +#endif int workMem) { Tuplesortstate *state = tuplesort_begin_common(workMem, false); @@ -2958,23 +2971,101 @@ reversedirection_heap(Tuplesortstate *state) } #ifdef PGXC +#ifdef XCP +static unsigned int +getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK) +{ + ResponseCombiner *combiner = state->combiner; + TupleTableSlot *dstslot = combiner->ss.ps.ps_ResultTupleSlot; + TupleTableSlot *slot; + + combiner->current_conn = tapenum; + slot = FetchTuple(combiner); + if (TupIsNull(slot)) + { + if (eofOK) + return 0; + else + elog(ERROR, "unexpected end of data"); + } + + if (slot != dstslot) + ExecCopySlot(dstslot, slot); + + return 1; +} + +static void +readtup_datanode(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + TupleTableSlot *slot = state->combiner->ss.ps.ps_ResultTupleSlot; + MinimalTuple tuple; + HeapTupleData htup; + + Assert(!TupIsNull(slot)); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = 
tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} +#else static unsigned int getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK) { - RemoteQueryState *combiner = state->combiner; - TupleTableSlot *temp_tts; + RemoteQueryState *combiner = state->combiner; + PGXCNodeHandle *conn = combiner->connections[tapenum]; + /* + * If connection is active (potentially has data to read) we can get node + * number from the connection. If connection is not active (we have read all + * available data rows) and if we have buffered data from that connection + * the node number is stored in combiner->tapenodes[tapenum]. + * If connection is inactive and no buffered data we have EOF condition + */ + int nid; + unsigned int len = 0; + ListCell *lc; + ListCell *prev = NULL; - if (combiner->rqs_tapedata) - elog(ERROR, "wrong state of datanode tape"); + /* May it ever happen ?! */ + if (!conn && !combiner->tapenodes) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node cursor"))); + + nid = conn ? 
PGXCNodeGetNodeId(conn->nodeoid, PGXC_NODE_DATANODE) : combiner->tapenodes[tapenum]; - combiner->rqs_tapenum = tapenum; - temp_tts = ExecProcNode((PlanState *)combiner); - if (!TupIsNull(temp_tts)) + if (nid < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Node id %d is incorrect", nid))); + + /* + * If there are buffered rows iterate over them and get first from + * the requested tape + */ + foreach (lc, combiner->rowBuffer) { - combiner->rqs_tapedata = temp_tts; - return temp_tts->tts_dataLen; + RemoteDataRow dataRow = (RemoteDataRow) lfirst(lc); + if (dataRow->msgnode == nid) + { + combiner->currentRow = *dataRow; + combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, lc, prev); + return dataRow->msglen; + } + prev = lc; } - else + + /* Nothing is found in the buffer, check for EOF */ + if (conn == NULL) { if (eofOK) return 0; @@ -2982,22 +3073,98 @@ getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK) elog(ERROR, "unexpected end of data"); } - /* Keep compiler happy */ - return 0; + /* Going to get data from connection, buffer if needed */ + if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != combiner) + BufferConnection(conn); + + /* Request more rows if needed */ + if (conn->state == DN_CONNECTION_STATE_IDLE) + { + Assert(combiner->cursor); + if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node cursor"))); + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node cursor"))); + conn->state = DN_CONNECTION_STATE_QUERY; + conn->combiner = combiner; + } + /* Read data from the connection until get a row or EOF */ + for (;;) + { + switch (handle_response(conn, combiner)) + { + case RESPONSE_SUSPENDED: + /* Send Execute to request next row */ + Assert(combiner->cursor); + if (len) + return len; + if (pgxc_node_send_execute(conn, 
combiner->cursor, 1000) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node cursor"))); + if (pgxc_node_send_sync(conn) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to fetch from data node cursor"))); + conn->state = DN_CONNECTION_STATE_QUERY; + conn->combiner = combiner; + /* fallthru */ + case RESPONSE_EOF: + /* receive more data */ + if (pgxc_node_receive(1, &conn, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg(conn->error))); + break; + case RESPONSE_COMPLETE: + /* EOF encountered, close the tape and report EOF */ + if (combiner->cursor) + { + combiner->connections[tapenum] = NULL; + if (len) + return len; + } + if (eofOK) + return 0; + else + elog(ERROR, "unexpected end of data"); + break; + case RESPONSE_DATAROW: + Assert(len == 0); + if (state->combiner->cursor) + { + /* + * We fetching one row at a time when running EQP + * so read following PortalSuspended or ResponseComplete + * to leave connection clean between the calls + */ + len = state->combiner->currentRow.msglen; + break; + } + else + return state->combiner->currentRow.msglen; + default: + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from the data nodes"))); + } + } } static void readtup_datanode(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len) { - TupleTableSlot *slot = state->combiner->rqs_tapedata; + TupleTableSlot *slot = state->combiner->ss.ss_ScanTupleSlot; MinimalTuple tuple; HeapTupleData htup; - Assert(!TupIsNull(slot)); - if (slot->tts_dataLen != len) - elog(ERROR, "Expected a tuple with length %d but got one with length %d", - len, slot->tts_dataLen); + FetchTuple(state->combiner, slot); + /* copy the tuple into sort storage */ tuple = ExecCopySlotMinimalTuple(slot); stup->tuple = (void *) tuple; @@ -3009,10 +3176,9 @@ readtup_datanode(Tuplesortstate *state, SortTuple *stup, state->sortKeys[0].ssup_attno, state->tupDesc, 
&stup->isnull1); - /* Reset the buffer for next read */ - state->combiner->rqs_tapedata = NULL; } -#endif +#endif /* XCP */ +#endif /* PGXC */ /* * Routines specialized for the CLUSTER case (HeapTuple data, with diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 8a7931b856..9f064144f0 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -43,6 +43,11 @@ * before switching to the other state or activating a different read pointer. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -72,6 +77,27 @@ typedef enum TSS_READFILE /* Reading from temp file */ } TupStoreStatus; + +#ifdef XCP +/* + * Supported tuplestore formats + */ +typedef enum +{ + TSF_MINIMAL, /* Minimal tuples */ + TSF_DATAROW, /* Datarow tuples */ + TSF_MESSAGE /* A Postgres protocol message data */ +} TupStoreFormat; + + +typedef struct +{ + int32 msglen; + char *msg; +} msg_data; +#endif + + /* * State for a single read pointer. If we are in state INMEM then all the * read pointers' "current" fields denote the read positions. In state @@ -99,6 +125,9 @@ typedef struct struct Tuplestorestate { TupStoreStatus status; /* enumerated value as shown above */ +#ifdef XCP + TupStoreFormat format; /* enumerated value as shown above */ +#endif int eflags; /* capability flags (OR of pointers' flags) */ bool backward; /* store extra length words in file? */ bool interXact; /* keep open through transactions? 
*/ @@ -106,6 +135,9 @@ struct Tuplestorestate long availMem; /* remaining memory available, in bytes */ BufFile *myfile; /* underlying file, or NULL if none */ MemoryContext context; /* memory context for holding tuples */ +#ifdef XCP + MemoryContext tmpcxt; /* memory context for holding temporary data */ +#endif ResourceOwner resowner; /* resowner for holding temp files */ /* @@ -171,6 +203,12 @@ struct Tuplestorestate int writepos_file; /* file# (valid if READFILE state) */ off_t writepos_offset; /* offset (valid if READFILE state) */ + + char *stat_name; + long stat_read_count; + long stat_write_count; + long stat_spill_read; + long stat_spill_write; }; #define COPYTUP(state,tup) ((*(state)->copytup) (state, tup)) @@ -235,7 +273,14 @@ static unsigned int getlen(Tuplestorestate *state, bool eofOK); static void *copytup_heap(Tuplestorestate *state, void *tup); static void writetup_heap(Tuplestorestate *state, void *tup); static void *readtup_heap(Tuplestorestate *state, unsigned int len); - +#ifdef XCP +static void *copytup_datarow(Tuplestorestate *state, void *tup); +static void writetup_datarow(Tuplestorestate *state, void *tup); +static void *readtup_datarow(Tuplestorestate *state, unsigned int len); +static void *copytup_message(Tuplestorestate *state, void *tup); +static void writetup_message(Tuplestorestate *state, void *tup); +static void *readtup_message(Tuplestorestate *state, unsigned int len); +#endif /* * tuplestore_begin_xxx @@ -275,6 +320,12 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes) state->readptrs[0].eof_reached = false; state->readptrs[0].current = 0; + state->stat_name = NULL; + state->stat_write_count = 0; + state->stat_read_count = 0; + state->stat_spill_write = 0; + state->stat_spill_read = 0; + return state; } @@ -313,9 +364,15 @@ tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes) state = tuplestore_begin_common(eflags, interXact, maxKBytes); +#ifdef XCP + state->format = TSF_MINIMAL; +#endif 
state->copytup = copytup_heap; state->writetup = writetup_heap; state->readtup = readtup_heap; +#ifdef XCP + state->tmpcxt = NULL; +#endif return state; } @@ -436,6 +493,16 @@ tuplestore_end(Tuplestorestate *state) { int i; + if (state->stat_name) + { + elog(LOG, "Tuplestore %s did %ld writes and %ld reads, " + "it spilled to disk after %ld writes and %ld reads, " + "now deleted %d memtuples out of %d", state->stat_name, + state->stat_write_count, state->stat_read_count, + state->stat_spill_write, state->stat_spill_read, + state->memtupdeleted, state->memtupcount); + } + if (state->myfile) BufFileClose(state->myfile); if (state->memtuples) @@ -548,6 +615,10 @@ tuplestore_puttupleslot(Tuplestorestate *state, MinimalTuple tuple; MemoryContext oldcxt = MemoryContextSwitchTo(state->context); +#ifdef XCP + if (state->format == TSF_MINIMAL) + { +#endif /* * Form a MinimalTuple in working memory */ @@ -555,6 +626,20 @@ tuplestore_puttupleslot(Tuplestorestate *state, USEMEM(state, GetMemoryChunkSpace(tuple)); tuplestore_puttuple_common(state, (void *) tuple); +#ifdef XCP + } + else if (state->format == TSF_DATAROW) + { + RemoteDataRow tuple = ExecCopySlotDatarow(slot, state->tmpcxt); + USEMEM(state, GetMemoryChunkSpace(tuple)); + + tuplestore_puttuple_common(state, (void *) tuple); + } + else + { + elog(ERROR, "Unsupported datastore format"); + } +#endif MemoryContextSwitchTo(oldcxt); } @@ -568,6 +653,10 @@ tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple) { MemoryContext oldcxt = MemoryContextSwitchTo(state->context); +#ifdef XCP + Assert(state->format == TSF_MINIMAL); +#endif + /* * Copy the tuple. (Must do this even in WRITEFILE case. Note that * COPYTUP includes USEMEM, so we needn't do that here.) 
@@ -590,6 +679,10 @@ tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc, MinimalTuple tuple; MemoryContext oldcxt = MemoryContextSwitchTo(state->context); +#ifdef XCP + Assert(state->format == TSF_MINIMAL); +#endif + tuple = heap_form_minimal_tuple(tdesc, values, isnull); USEMEM(state, GetMemoryChunkSpace(tuple)); @@ -605,6 +698,9 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple) int i; ResourceOwner oldowner; + if (state->stat_name) + state->stat_write_count++; + switch (state->status) { case TSS_INMEM: @@ -655,6 +751,12 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple) if (state->memtupcount < state->memtupsize && !LACKMEM(state)) return; + if (state->stat_name) + { + state->stat_spill_read = state->stat_read_count; + state->stat_spill_write = state->stat_write_count; + } + /* * Nope; time to switch to tape-based operation. Make sure that * the temp file(s) are created in suitable temp tablespaces. @@ -764,6 +866,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, return NULL; if (readptr->current < state->memtupcount) { + if (state->stat_name) + state->stat_read_count++; + /* We have another tuple, so return it */ return state->memtuples[readptr->current++]; } @@ -795,6 +900,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, Assert(!state->truncated); return NULL; } + if (state->stat_name) + state->stat_read_count++; + return state->memtuples[readptr->current - 1]; } break; @@ -824,6 +932,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, if ((tuplen = getlen(state, true)) != 0) { tup = READTUP(state, tuplen); + if (state->stat_name && tup) + state->stat_read_count++; + return tup; } else @@ -892,6 +1003,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, SEEK_CUR) != 0) elog(ERROR, "bogus tuple length in backward scan"); tup = READTUP(state, tuplen); + if (state->stat_name && tup) + state->stat_read_count++; + return tup; default: @@ -924,12 +1038,37 @@ 
tuplestore_gettupleslot(Tuplestorestate *state, bool forward, if (tuple) { +#ifdef XCP + if (state->format == TSF_MINIMAL) + { +#endif if (copy && !should_free) { tuple = heap_copy_minimal_tuple(tuple); should_free = true; } ExecStoreMinimalTuple(tuple, slot, should_free); +#ifdef XCP + } + else if (state->format == TSF_DATAROW) + { + RemoteDataRow datarow = (RemoteDataRow) tuple; + if (copy && !should_free) + { + RemoteDataRow dup = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datarow->msglen); + dup->msgnode = datarow->msgnode; + dup->msglen = datarow->msglen; + memcpy(dup->msg, datarow->msg, datarow->msglen); + datarow = dup; + should_free = true; + } + ExecStoreDataRowTuple(datarow, slot, should_free); + } + else + { + elog(ERROR, "Unsupported datastore format"); + } +#endif return true; } else @@ -1311,3 +1450,218 @@ readtup_heap(Tuplestorestate *state, unsigned int len) elog(ERROR, "unexpected end of data"); return (void *) tuple; } + + +#ifdef XCP +/* + * Routines to support Datarow tuple format, used for exchange between nodes + * as well as send data to client + */ +Tuplestorestate * +tuplestore_begin_datarow(bool interXact, int maxKBytes, + MemoryContext tmpcxt) +{ + Tuplestorestate *state; + + state = tuplestore_begin_common(0, interXact, maxKBytes); + + state->format = TSF_DATAROW; + state->copytup = copytup_datarow; + state->writetup = writetup_datarow; + state->readtup = readtup_datarow; + state->tmpcxt = tmpcxt; + + return state; +} + + +/* + * Do we need this at all? 
+ */ +static void * +copytup_datarow(Tuplestorestate *state, void *tup) +{ + Assert(false); + return NULL; +} + +static void +writetup_datarow(Tuplestorestate *state, void *tup) +{ + RemoteDataRow tuple = (RemoteDataRow) tup; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = tuple->msg; + unsigned int tupbodylen = tuple->msglen; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int) + sizeof(tuple->msgnode); + + if (BufFileWrite(state->myfile, (void *) &tuplen, + sizeof(int)) != sizeof(int)) + elog(ERROR, "write failed"); + if (BufFileWrite(state->myfile, (void *) &tuple->msgnode, + sizeof(tuple->msgnode)) != sizeof(tuple->msgnode)) + elog(ERROR, "write failed"); + if (BufFileWrite(state->myfile, (void *) tupbody, + tupbodylen) != (size_t) tupbodylen) + elog(ERROR, "write failed"); + if (state->backward) /* need trailing length word? */ + if (BufFileWrite(state->myfile, (void *) &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "write failed"); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); +} + +static void * +readtup_datarow(Tuplestorestate *state, unsigned int len) +{ + RemoteDataRow tuple = (RemoteDataRow) palloc(len); + unsigned int tupbodylen = len - sizeof(int) - sizeof(tuple->msgnode); + + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* read in the tuple proper */ + tuple->msglen = tupbodylen; + if (BufFileRead(state->myfile, (void *) &tuple->msgnode, + sizeof(tuple->msgnode)) != sizeof(tuple->msgnode)) + elog(ERROR, "unexpected end of data"); + if (BufFileRead(state->myfile, (void *) tuple->msg, + tupbodylen) != (size_t) tupbodylen) + elog(ERROR, "unexpected end of data"); + if (state->backward) /* need trailing length word? 
*/ + if (BufFileRead(state->myfile, (void *) &len, + sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of data"); + return (void *) tuple; +} + + +/* + * Routines to support storage of protocol message data + */ +Tuplestorestate * +tuplestore_begin_message(bool interXact, int maxKBytes) +{ + Tuplestorestate *state; + + state = tuplestore_begin_common(0, interXact, maxKBytes); + + state->format = TSF_MESSAGE; + state->copytup = copytup_message; + state->writetup = writetup_message; + state->readtup = readtup_message; + state->tmpcxt = NULL; + + return state; +} + + +void +tuplestore_putmessage(Tuplestorestate *state, int len, char* msg) +{ + msg_data m; + void *tuple; + MemoryContext oldcxt = MemoryContextSwitchTo(state->context); + + Assert(state->format == TSF_MESSAGE); + + m.msglen = len; + m.msg = msg; + + tuple = COPYTUP(state, &m); + tuplestore_puttuple_common(state, tuple); + + MemoryContextSwitchTo(oldcxt); +} + + +char * +tuplestore_getmessage(Tuplestorestate *state, int *len) +{ + bool should_free; + void *result; + void *tuple = tuplestore_gettuple(state, true, &should_free); + + Assert(state->format == TSF_MESSAGE); + + /* done? 
*/ + if (!tuple) + return NULL; + + *len = *((int *) tuple); + + result = palloc(*len); + memcpy(result, ((char *) tuple) + sizeof(int), *len); + if (should_free) + pfree(tuple); + + return (char *) result; +} + + +static void * +copytup_message(Tuplestorestate *state, void *tup) +{ + msg_data *m = (msg_data *) tup; + void *tuple; + + tuple = palloc(m->msglen + sizeof(int)); + *((int *) tuple) = m->msglen; + memcpy(((char *) tuple) + sizeof(int), m->msg, m->msglen); + USEMEM(state, GetMemoryChunkSpace(tuple)); + return tuple; +} + + +static void +writetup_message(Tuplestorestate *state, void *tup) +{ + int *msglen = (int *) tup; + /* total on-disk footprint: */ + unsigned int tuplen = *msglen; + + if (BufFileWrite(state->myfile, tup, tuplen) != tuplen) + elog(ERROR, "write failed"); + if (state->backward) /* need trailing length word? */ + if (BufFileWrite(state->myfile, (void *) &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "write failed"); + + FREEMEM(state, GetMemoryChunkSpace(tup)); + pfree(tup); +} + +static void * +readtup_message(Tuplestorestate *state, unsigned int len) +{ + void *tuple = palloc(len + sizeof(int)); + *((int *) tuple) = len; + + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* read in the tuple proper */ + if (BufFileRead(state->myfile, ((char *) tuple) + sizeof(int), + len) != (size_t) len) + elog(ERROR, "unexpected end of data"); + if (state->backward) /* need trailing length word? 
*/ + if (BufFileRead(state->myfile, (void *) &len, + sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of data"); + return tuple; +} +#endif + + +void +tuplestore_collect_stat(Tuplestorestate *state, char *name) +{ + if (state->status != TSS_INMEM || state->memtupcount != 0) + { + elog(WARNING, "tuplestore %s is already in use, to late to get statistics", + name); + return; + } + + state->stat_name = pstrdup(name); +} diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index 5429922d3f..30182ceea6 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -30,6 +30,11 @@ * destroyed at the end of each transaction. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -270,6 +275,25 @@ GetComboCommandId(CommandId cmin, CommandId cmax) static CommandId GetRealCmin(CommandId combocid) { +#ifdef XCP + /* + * Workaround against assertion failure (or segmentation fault if + * assertions is disabled) in a secondary datanode session when trying + * to check visibility of a tuple with ComboCID. + * ComboCID is only valid in a session that did the update, that is the + * primary session. + * Ideally we should have a solution, how to share ComboCIDs + * between session just make tuples with ComboCIDs invisible to secondary + * processes. Until then, we will have visibility issues in rare cases, + * if in the same transaction: + * 1. Tuples inserted + * 2. Cursor is opened + * 3. 
Tuples inserted in step 1 are deleted + * + */ + if (combocid >= usedComboCids) + return FirstCommandId - 1; +#endif Assert(combocid < usedComboCids); return comboCids[combocid].cmin; } @@ -277,6 +301,19 @@ GetRealCmin(CommandId combocid) static CommandId GetRealCmax(CommandId combocid) { +#ifdef XCP + /* + * Ugly workaround against assertion failure (or segmentation fault if + * assertions is disabled) in a secondary datanode session when trying + * to check visibility of a tuple with ComboCID. + * ComboCID is only valid in a session that did the update, that is the + * primary session. Until we come up with a solution, how to share ComboCIDs + * between session just make tuples with ComboCIDs invisible to secondary + * processes. + */ + if (combocid >= usedComboCids) + return FirstCommandId; +#endif Assert(combocid < usedComboCids); return comboCids[combocid].cmax; } diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 2899b94142..c55b947833 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -27,6 +27,11 @@ * for too long.) * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -205,7 +210,11 @@ GetTransactionSnapshot(void) * The command id should therefore be updated in the * current snapshot. 
*/ +#ifdef XCP + if (IsConnFromCoord() || IsConnFromDatanode()) +#else if (IsConnFromCoord()) +#endif SnapshotSetCommandId(GetCurrentCommandId(false)); #endif return CurrentSnapshot; diff --git a/src/bin/Makefile b/src/bin/Makefile index b02c3caca6..7498395022 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -13,7 +13,7 @@ subdir = src/bin top_builddir = ../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = gtm_ctl initdb initgtm pg_ctl pg_dump \ +SUBDIRS = initdb initgtm pg_ctl pg_dump \ psql scripts pg_config pg_controldata pg_resetxlog pg_basebackup ifeq ($(PORTNAME), win32) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 8ff3a0036c..75c58f760a 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -38,6 +38,11 @@ * * This code is released under the terms of the PostgreSQL License. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -136,6 +141,9 @@ static char *conf_file; static char *conversion_file; static char *dictionary_file; static char *info_schema_file; +#ifdef XCP +static char *storm_cat_file; +#endif static char *features_file; static char *system_views_file; static bool made_new_pgdata = false; @@ -216,6 +224,9 @@ static void setup_dictionary(void); static void setup_privileges(void); static void set_info_version(void); static void setup_schema(void); +#ifdef XCP +static void setup_storm(void); +#endif static void load_plpgsql(void); static void vacuum_db(void); static void make_template0(void); @@ -1591,7 +1602,11 @@ setup_description(void) PG_CMD_PRINTF1("COPY tmp_pg_shdescription FROM E'%s';\n", escape_quotes(shdesc_file)); +#ifdef XCP + PG_CMD_PUTS("INSERT INTO pg_catalog.pg_shdescription " +#else PG_CMD_PUTS("INSERT INTO pg_shdescription " +#endif " SELECT t.objoid, c.oid, t.description " " FROM tmp_pg_shdescription t, pg_class c " " WHERE c.relname = t.classname;\n"); @@ -1890,6 +1905,9 @@ setup_privileges(void) " WHERE relkind IN ('r', 'v', 'S') AND relacl IS NULL;\n", "GRANT USAGE ON SCHEMA pg_catalog TO PUBLIC;\n", "GRANT CREATE, USAGE ON SCHEMA public TO PUBLIC;\n", +#ifdef XCP + "GRANT USAGE ON SCHEMA storm_catalog TO PUBLIC;\n", +#endif "REVOKE ALL ON pg_largeobject FROM PUBLIC;\n", NULL }; @@ -2000,6 +2018,46 @@ setup_schema(void) check_ok(); } +#ifdef XCP +/* + * load storm catalog and populate from features file + */ +static void +setup_storm(void) +{ + PG_CMD_DECL; + char **line; + char **lines; + + fputs(_("creating storm catalog... 
"), stdout); + fflush(stdout); + + lines = readfile(storm_cat_file); + + /* + * We use -j here to avoid backslashing stuff in storm_catalog.sql + */ + snprintf(cmd, sizeof(cmd), + "\"%s\" %s -j template1 >%s", + backend_exec, backend_options, + DEVNULL); + + PG_CMD_OPEN; + + for (line = lines; *line != NULL; line++) + { + PG_CMD_PUTS(*line); + free(*line); + } + + free(lines); + + PG_CMD_CLOSE; + + check_ok(); +} +#endif + /* * load PL/pgsql server-side language */ @@ -2090,7 +2148,11 @@ make_template0(void) const char **line; static const char *template0_setup[] = { "CREATE DATABASE template0;\n", +#ifdef XCP + "UPDATE pg_catalog.pg_database SET " +#else "UPDATE pg_database SET " +#endif " datistemplate = 't', " " datallowconn = 'f' " " WHERE datname = 'template0';\n", @@ -2098,8 +2160,13 @@ make_template0(void) /* * We use the OID of template0 to determine lastsysoid */ +#ifdef XCP + "UPDATE pg_catalog.pg_database SET datlastsysoid = " + " (SELECT oid FROM pg_catalog.pg_database " +#else "UPDATE pg_database SET datlastsysoid = " " (SELECT oid FROM pg_database " +#endif " WHERE datname = 'template0');\n", /* @@ -2115,7 +2182,11 @@ make_template0(void) /* * Finally vacuum to clean up dead rows in pg_database */ +#ifdef XCP + "VACUUM FULL pg_catalog.pg_database;\n", +#else "VACUUM FULL pg_database;\n", +#endif NULL }; @@ -2593,8 +2664,12 @@ usage(const char *progname) printf(_(" --auth-local=METHOD default authentication method for local-socket connections\n")); printf(_(" [-D, --pgdata=]DATADIR location for this database cluster\n")); #ifdef PGXC +#ifdef XCP + printf(_(" --nodename=NODENAME name of Postgres-XL node initialized\n")); +#else printf(_(" --nodename=NODENAME name of Postgres-XC node initialized\n")); #endif +#endif printf(_(" -E, --encoding=ENCODING set default encoding for new databases\n")); printf(_(" --locale=LOCALE set default locale for new databases\n")); printf(_(" --lc-collate=, --lc-ctype=, --lc-messages=LOCALE\n" @@ -2877,7 +2952,11 @@ 
main(int argc, char *argv[]) #ifdef PGXC if (!nodename) { +#ifdef XCP + fprintf(stderr, _("%s: Postgres-XL node name is mandatory\n"), progname); +#else fprintf(stderr, _("%s: Postgres-XC node name is mandatory\n"), progname); +#endif fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); @@ -3025,6 +3104,9 @@ main(int argc, char *argv[]) set_input(&conversion_file, "conversion_create.sql"); set_input(&dictionary_file, "snowball_create.sql"); set_input(&info_schema_file, "information_schema.sql"); +#ifdef XCP + set_input(&storm_cat_file, "storm_catalog.sql"); +#endif set_input(&features_file, "sql_features.txt"); set_input(&system_views_file, "system_views.sql"); @@ -3058,6 +3140,9 @@ main(int argc, char *argv[]) check_input(conversion_file); check_input(dictionary_file); check_input(info_schema_file); +#ifdef XCP + check_input(storm_cat_file); +#endif check_input(features_file); check_input(system_views_file); @@ -3402,6 +3487,10 @@ main(int argc, char *argv[]) load_plpgsql(); +#ifdef XCP + setup_storm(); +#endif + vacuum_db(); make_template0(); @@ -3423,11 +3512,19 @@ main(int argc, char *argv[]) #ifdef PGXC +#ifdef XCP + printf(_("\nSuccess.\n You can now start the database server of the Postgres-XL coordinator using:\n\n" +#else printf(_("\nSuccess.\n You can now start the database server of the Postgres-XC coordinator using:\n\n" +#endif " %s%s%spostgres%s --coordinator -D %s%s%s\n" "or\n" " %s%s%spg_ctl%s start -D %s%s%s -Z coordinator -l logfile\n\n" +#ifdef XCP + " You can now start the database server of the Postgres-XL datanode using:\n\n" +#else " You can now start the database server of the Postgres-XC datanode using:\n\n" +#endif " %s%s%spostgres%s --datanode -D %s%s%s\n" "or \n" " %s%s%spg_ctl%s start -D %s%s%s -Z datanode -l logfile\n\n"), diff --git a/src/bin/initgtm/initgtm.c b/src/bin/initgtm/initgtm.c index 57856e0f2e..d779fff6b9 100644 --- a/src/bin/initgtm/initgtm.c +++ b/src/bin/initgtm/initgtm.c @@ -772,7 +772,11 
@@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo) static void usage(const char *progname) { +#ifdef XCP + printf(_("%s initializes GTM for a Postgres-XL database cluster.\n\n"), progname); +#else printf(_("%s initializes a GTM for Postgres-XC database cluster.\n\n"), progname); +#endif printf(_("Usage:\n")); printf(_(" %s [NODE-TYPE] [OPTION]... [DATADIR]\n"), progname); printf(_("\nOptions:\n")); @@ -823,7 +827,11 @@ main(int argc, char *argv[]) } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { +#ifdef XCP + puts("initgtm (Postgres-XL) " PGXC_VERSION); +#else puts("initgtm (Postgres-XC) " PGXC_VERSION); +#endif exit(0); } } diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index 1b4a9d240b..e5b3ee06c2 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -154,7 +154,7 @@ GetConnection(void) if (PQstatus(tmpconn) != CONNECTION_OK) { - fprintf(stderr, _("%s: could not connect to server: %s\n"), + fprintf(stderr, _("%s: could not connect to server: %s"), progname, PQerrorMessage(tmpconn)); return NULL; } diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 0eb8084053..cf3d0e1d28 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -61,8 +61,8 @@ typedef enum NO_COMMAND = 0, INIT_COMMAND, START_COMMAND, - RESTART_COMMAND, STOP_COMMAND, + RESTART_COMMAND, RELOAD_COMMAND, STATUS_COMMAND, PROMOTE_COMMAND, @@ -1791,11 +1791,15 @@ do_help(void) printf(_(" -t, --timeout=SECS seconds to wait when using -w option\n")); printf(_(" -w wait until operation completes\n")); printf(_(" -W do not wait until operation completes\n")); + printf(_(" --help show this help, then exit\n")); + printf(_(" --version output version information, then exit\n")); #ifdef PGXC +#ifdef XCP + printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XL)\n")); +#else printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" 
(Postgres-XC)\n")); #endif - printf(_(" --help show this help, then exit\n")); - printf(_(" --version output version information, then exit\n")); +#endif printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n")); printf(_("If the -D option is omitted, the environment variable PGDATA is used.\n")); @@ -2110,6 +2114,8 @@ main(int argc, char **argv) pgxcCommand = strdup("--coordinator"); else if (strcmp(optarg, "datanode") == 0) pgxcCommand = strdup("--datanode"); + else if (strcmp(optarg, "restoremode") == 0) + pgxcCommand = strdup("--restoremode"); #endif case 's': silent_mode = true; @@ -2298,12 +2304,12 @@ main(int argc, char **argv) case START_COMMAND: do_start(); break; - case RESTART_COMMAND: - do_restart(); - break; case STOP_COMMAND: do_stop(); break; + case RESTART_COMMAND: + do_restart(); + break; case RELOAD_COMMAND: do_reload(); break; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 539bcb9167..0637563a31 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4,6 +4,11 @@ * pg_dump is a utility for dumping out a postgres database * into a script file. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -137,7 +142,9 @@ static int column_inserts = 0; static int no_security_labels = 0; static int no_unlogged_table_data = 0; static int serializable_deferrable = 0; - +#ifdef PGXC +static int include_nodes = 0; +#endif static void help(const char *progname); static void setup_connection(Archive *AH, const char *dumpencoding, @@ -190,6 +197,7 @@ static void dumpTable(Archive *fout, TableInfo *tbinfo); static void dumpTableSchema(Archive *fout, TableInfo *tbinfo); static void dumpAttrDef(Archive *fout, AttrDefInfo *adinfo); static void dumpSequence(Archive *fout, TableInfo *tbinfo); +static void dumpSequenceData(Archive *fout, TableDataInfo *tdinfo); static void dumpIndex(Archive *fout, IndxInfo *indxinfo); static void dumpConstraint(Archive *fout, ConstraintInfo *coninfo); static void dumpTableConstraintComment(Archive *fout, ConstraintInfo *coninfo); @@ -340,6 +348,9 @@ main(int argc, char **argv) {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-security-labels", no_argument, &no_security_labels, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1}, +#ifdef PGXC + {"include-nodes", no_argument, &include_nodes, 1}, +#endif {NULL, 0, NULL, 0} }; @@ -816,6 +827,9 @@ help(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); +#ifdef PGXC + printf(_(" --include-nodes include TO NODE clause in the dumped CREATE TABLE commands\n")); +#endif printf(_("\nConnection options:\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); @@ -1049,6 +1063,9 @@ selectDumpableNamespace(NamespaceInfo *nsinfo) nsinfo->dobj.dump = simple_oid_list_member(&schema_include_oids, nsinfo->dobj.catId.oid); else if (strncmp(nsinfo->dobj.name, "pg_", 3) == 0 || +#ifdef 
XCP + strncmp(nsinfo->dobj.name, "storm_", 6) == 0 || +#endif strcmp(nsinfo->dobj.name, "information_schema") == 0) nsinfo->dobj.dump = false; else @@ -1808,6 +1825,23 @@ dumpDatabase(Archive *fout) selectSourceSchema(fout, "pg_catalog"); /* Get the database owner and parameters from pg_database */ +#ifdef XCP + if (fout->remoteVersion >= 90100) + { + appendPQExpBuffer(dbQry, "SELECT 1262::oid as tableoid, oid, " + "(%s datdba) AS dba, " + "pg_encoding_to_char(encoding) AS encoding, " + "datcollate, datctype, datfrozenxid, " + "(SELECT spcname FROM pg_tablespace t WHERE t.oid = dattablespace) AS tablespace, " + "shobj_description(oid, 'pg_database') AS description " + + "FROM pg_database " + "WHERE datname = ", + username_subquery); + appendStringLiteralAH(dbQry, datname, fout); + } + else +#endif if (fout->remoteVersion >= 80400) { appendPQExpBuffer(dbQry, "SELECT tableoid, oid, " @@ -3832,6 +3866,7 @@ getTables(Archive *fout, int *numTables) #ifdef PGXC int i_pgxclocatortype; int i_pgxcattnum; + int i_pgxc_node_names; #endif int i_reltablespace; int i_reloptions; @@ -3883,6 +3918,7 @@ getTables(Archive *fout, int *numTables) #ifdef PGXC "(SELECT pclocatortype from pgxc_class v where v.pcrelid = c.oid) AS pgxclocatortype," "(SELECT pcattnum from pgxc_class v where v.pcrelid = c.oid) AS pgxcattnum," + "(SELECT string_agg(node_name,',') AS pgxc_node_names from pgxc_node n where n.oid in (select unnest(nodeoids) from pgxc_class v where v.pcrelid=c.oid) ) , " #endif "array_to_string(c.reloptions, ', ') AS reloptions, " "array_to_string(array(SELECT 'toast.' || x FROM unnest(tc.reloptions) x), ', ') AS toast_reloptions " @@ -3941,8 +3977,6 @@ getTables(Archive *fout, int *numTables) /* * Left join to pick up dependency info linking sequences to their * owning column, if any (note this dependency is AUTO as of 8.2) - * PGXC is based on PostgreSQL version 8.4, it is not necessary to - * to modify the other SQL queries. 
*/ appendPQExpBuffer(query, "SELECT c.tableoid, c.oid, c.relname, " @@ -3957,7 +3991,7 @@ getTables(Archive *fout, int *numTables) "d.refobjid AS owning_tab, " "d.refobjsubid AS owning_col, " "(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, " - "array_to_string(c.reloptions, ', ') AS reloptions, " + "array_to_string(c.reloptions, ', ') AS reloptions, " "array_to_string(array(SELECT 'toast.' || x FROM unnest(tc.reloptions) x), ', ') AS toast_reloptions " "FROM pg_class c " "LEFT JOIN pg_depend d ON " @@ -4204,6 +4238,7 @@ getTables(Archive *fout, int *numTables) #ifdef PGXC i_pgxclocatortype = PQfnumber(res, "pgxclocatortype"); i_pgxcattnum = PQfnumber(res, "pgxcattnum"); + i_pgxc_node_names = PQfnumber(res, "pgxc_node_names"); #endif i_reltablespace = PQfnumber(res, "reltablespace"); i_reloptions = PQfnumber(res, "reloptions"); @@ -4274,6 +4309,7 @@ getTables(Archive *fout, int *numTables) tblinfo[i].pgxclocatortype = *(PQgetvalue(res, i, i_pgxclocatortype)); tblinfo[i].pgxcattnum = atoi(PQgetvalue(res, i, i_pgxcattnum)); } + tblinfo[i].pgxc_node_names = pg_strdup(PQgetvalue(res, i, i_pgxc_node_names)); #endif tblinfo[i].reltablespace = pg_strdup(PQgetvalue(res, i, i_reltablespace)); tblinfo[i].reloptions = pg_strdup(PQgetvalue(res, i, i_reloptions)); @@ -7174,7 +7210,10 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj) dumpCast(fout, (CastInfo *) dobj); break; case DO_TABLE_DATA: - dumpTableData(fout, (TableDataInfo *) dobj); + if (((TableDataInfo *) dobj)->tdtable->relkind == RELKIND_SEQUENCE) + dumpSequenceData(fout, (TableDataInfo *) dobj); + else + dumpTableData(fout, (TableDataInfo *) dobj); break; case DO_DUMMY_TYPE: /* table rowtypes and array types are never dumped separately */ @@ -12489,6 +12528,12 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) fmtId(tbinfo->attnames[hashkey - 1])); } } + if (include_nodes && + tbinfo->pgxc_node_names != NULL && + tbinfo->pgxc_node_names[0] != '\0') + { + 
appendPQExpBuffer(q, "\nTO NODE (%s)", tbinfo->pgxc_node_names); + } #endif /* Dump generic options if any */ if (ftoptions && ftoptions[0]) @@ -13446,34 +13491,6 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) if (!schemaOnly) { -#ifdef PGXC - /* - * In Postgres-XC it is possible that the current value of a - * sequence cached on each node is different as several sessions - * might use the sequence on different nodes. So what we do here - * to get a consistent dump is to get the next value of sequence. - * This insures that sequence value is unique as nextval is directly - * obtained from GTM. - */ - resetPQExpBuffer(query); - appendPQExpBuffer(query, "SELECT pg_catalog.nextval("); - appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); - appendPQExpBuffer(query, ");\n"); - res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); - - if (PQntuples(res) != 1) - { - write_msg(NULL, ngettext("query to get nextval of sequence \"%s\" " - "returned %d rows (expected 1)\n", - "query to get nextval of sequence \"%s\" " - "returned %d rows (expected 1)\n", - PQntuples(res)), - tbinfo->dobj.name, PQntuples(res)); - exit_nicely(1); - } - - last = PQgetvalue(res, 0, 0); -#endif resetPQExpBuffer(query); appendPQExpBuffer(query, "SELECT pg_catalog.setval("); appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); @@ -13498,6 +13515,88 @@ dumpSequence(Archive *fout, TableInfo *tbinfo) destroyPQExpBuffer(labelq); } +/* + * dumpSequenceData + * write the data of one user-defined sequence + */ +static void +dumpSequenceData(Archive *fout, TableDataInfo *tdinfo) +{ + TableInfo *tbinfo = tdinfo->tdtable; + PGresult *res; + char *last; + bool called; + PQExpBuffer query = createPQExpBuffer(); + + /* Make sure we are in proper schema */ + selectSourceSchema(fout, tbinfo->dobj.namespace->dobj.name); + + appendPQExpBuffer(query, + "SELECT last_value, is_called FROM %s", + fmtId(tbinfo->dobj.name)); + + res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); + + if 
(PQntuples(res) != 1) + { + write_msg(NULL, ngettext("query to get data of sequence \"%s\" returned %d row (expected 1)\n", + "query to get data of sequence \"%s\" returned %d rows (expected 1)\n", + PQntuples(res)), + tbinfo->dobj.name, PQntuples(res)); + exit_nicely(1); + } + + last = PQgetvalue(res, 0, 0); + called = (strcmp(PQgetvalue(res, 0, 1), "t") == 0); +#ifdef PGXC + /* + * In Postgres-XC it is possible that the current value of a + * sequence cached on each node is different as several sessions + * might use the sequence on different nodes. So what we do here + * to get a consistent dump is to get the next value of sequence. + * This insures that sequence value is unique as nextval is directly + * obtained from GTM. + */ + resetPQExpBuffer(query); + appendPQExpBuffer(query, "SELECT pg_catalog.nextval("); + appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); + appendPQExpBuffer(query, ");\n"); + res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); + + if (PQntuples(res) != 1) + { + write_msg(NULL, ngettext("query to get nextval of sequence \"%s\" " + "returned %d rows (expected 1)\n", + "query to get nextval of sequence \"%s\" " + "returned %d rows (expected 1)\n", + PQntuples(res)), + tbinfo->dobj.name, PQntuples(res)); + exit_nicely(1); + } + + last = PQgetvalue(res, 0, 0); +#endif + resetPQExpBuffer(query); + appendPQExpBuffer(query, "SELECT pg_catalog.setval("); + appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout); + appendPQExpBuffer(query, ", %s, %s);\n", + last, (called ? 
"true" : "false")); + + ArchiveEntry(fout, nilCatalogId, createDumpId(), + tbinfo->dobj.name, + tbinfo->dobj.namespace->dobj.name, + NULL, + tbinfo->rolname, + false, "SEQUENCE SET", SECTION_DATA, + query->data, "", NULL, + &(tbinfo->dobj.dumpId), 1, + NULL, NULL); + + PQclear(res); + + destroyPQExpBuffer(query); +} + static void dumpTrigger(Archive *fout, TriggerInfo *tginfo) { diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index e52fb8319b..b48a32d12a 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -263,6 +263,7 @@ typedef struct _tableInfo /* PGXC table locator Data */ char pgxclocatortype; /* Type of PGXC table locator */ int pgxcattnum; /* Number of the attribute the table is partitioned with */ + char *pgxc_node_names; /* List of node names where this table is distributed */ #endif /* * These fields are computed only if we decide the table is interesting diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 053e5fd36a..c90de1b18d 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -59,6 +59,11 @@ static PGconn *connectDatabase(const char *dbname, const char *pghost, const cha static PGresult *executeQuery(PGconn *conn, const char *query); static void executeCommand(PGconn *conn, const char *query); +#ifdef PGXC +static void dumpNodes(PGconn *conn); +static void dumpNodeGroups(PGconn *conn); +#endif /* PGXC */ + static char pg_dump_bin[MAXPGPATH]; static PQExpBuffer pgdumpopts; static bool skip_acls = false; @@ -78,6 +83,10 @@ static int server_version; static FILE *OPF; static char *filename = NULL; +#ifdef PGXC +static int dump_nodes = 0; +static int include_nodes = 0; +#endif /* PGXC */ int main(int argc, char *argv[]) @@ -138,7 +147,10 @@ main(int argc, char *argv[]) {"use-set-session-authorization", no_argument, &use_setsessauth, 1}, {"no-security-labels", no_argument, &no_security_labels, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 
1}, - +#ifdef PGXC + {"dump-nodes", no_argument, &dump_nodes, 1}, + {"include-nodes", no_argument, &include_nodes, 1}, +#endif {NULL, 0, NULL, 0} }; @@ -360,6 +372,11 @@ main(int argc, char *argv[]) if (no_unlogged_table_data) appendPQExpBuffer(pgdumpopts, " --no-unlogged-table-data"); +#ifdef PGXC + if (include_nodes) + appendPQExpBuffer(pgdumpopts, " --include-nodes"); +#endif + /* * If there was a database specified on the command line, use that, * otherwise try to connect to database "postgres", and failing that @@ -511,6 +528,15 @@ main(int argc, char *argv[]) if (server_version >= 90000) dumpDbRoleConfig(conn); } + +#ifdef PGXC + /* Dump nodes and node groups */ + if (dump_nodes) + { + dumpNodes(conn); + dumpNodeGroups(conn); + } +#endif } if (!globals_only && !roles_only && !tablespaces_only) @@ -564,6 +590,10 @@ help(void) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); +#ifdef PGXC + printf(_(" --dump-nodes include nodes and node groups in the dump\n")); + printf(_(" --include-nodes include TO NODE clause in the dumped CREATE TABLE commands\n")); +#endif printf(_("\nConnection options:\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); @@ -1918,3 +1948,76 @@ doShellQuoting(PQExpBuffer buf, const char *str) appendPQExpBufferChar(buf, '"'); #endif /* WIN32 */ } + +#ifdef PGXC +static void +dumpNodes(PGconn *conn) +{ + PQExpBuffer query; + PGresult *res; + int num; + int i; + + query = createPQExpBuffer(); + + appendPQExpBuffer(query, "select 'CREATE NODE ' || node_name || '" + " WITH (TYPE = ' || chr(39) || (case when node_type='C'" + " then 'coordinator' else 'datanode' end) || chr(39)" + " || ' , HOST = ' || chr(39) || node_host || chr(39)" + " || ', PORT = ' || node_port || (case when nodeis_primary='t'" + " then ', PRIMARY' else ' ' end) || (case when nodeis_preferred" + " then ', PREFERRED' else ' ' end) || ');' " + " as 
node_query from pg_catalog.pgxc_node order by oid"); + + res = executeQuery(conn, query->data); + + num = PQntuples(res); + + if (num > 0) + fprintf(OPF, "--\n-- Nodes\n--\n\n"); + + for (i = 0; i < num; i++) + { + fprintf(OPF, "%s\n", PQgetvalue(res, i, PQfnumber(res, "node_query"))); + } + fprintf(OPF, "\n"); + + PQclear(res); + destroyPQExpBuffer(query); +} + +static void +dumpNodeGroups(PGconn *conn) +{ + PQExpBuffer query; + PGresult *res; + int num; + int i; + + query = createPQExpBuffer(); + + appendPQExpBuffer(query, + "select 'CREATE NODE GROUP ' || pgxc_group.group_name" + " || ' WITH(' || string_agg(node_name,',') || ');'" + " as group_query from pg_catalog.pgxc_node, pg_catalog.pgxc_group" + " where pgxc_node.oid = any (pgxc_group.group_members)" + " group by pgxc_group.group_name" + " order by pgxc_group.group_name"); + + res = executeQuery(conn, query->data); + + num = PQntuples(res); + + if (num > 0) + fprintf(OPF, "--\n-- Node groups\n--\n\n"); + + for (i = 0; i < num; i++) + { + fprintf(OPF, "%s\n", PQgetvalue(res, i, PQfnumber(res, "group_query"))); + } + fprintf(OPF, "\n"); + + PQclear(res); + destroyPQExpBuffer(query); +} +#endif diff --git a/src/bin/pg_resetxlog/po/sv.po b/src/bin/pg_resetxlog/po/sv.po deleted file mode 100644 index 16e6e051c8..0000000000 --- a/src/bin/pg_resetxlog/po/sv.po +++ /dev/null @@ -1,463 +0,0 @@ -# Swedish message translation file for resetxlog. -# Dennis Bj�rklund <[email protected]>, 2002, 2003, 2004, 2005, 2006. -# Peter Eisentraut <[email protected]>, 2010. 
-# -msgid "" -msgstr "" -"Project-Id-Version: PostgreSQL 9.0\n" -"Report-Msgid-Bugs-To: [email protected]\n" -"POT-Creation-Date: 2010-07-02 05:22+0000\n" -"PO-Revision-Date: 2010-07-02 20:32-0400\n" -"Last-Translator: Peter Eisentraut <[email protected]>\n" -"Language-Team: Swedish <[email protected]>\n" -"MIME-Version: 1.0\n" -"Content-Type: text/plain; charset=ISO-8859-1\n" -"Content-Transfer-Encoding: 8bit\n" - -#: pg_resetxlog.c:135 -#, c-format -msgid "%s: invalid argument for option -e\n" -msgstr "%s: felaktigt argument till flagga -e\n" - -#: pg_resetxlog.c:136 pg_resetxlog.c:151 pg_resetxlog.c:166 pg_resetxlog.c:181 -#: pg_resetxlog.c:196 pg_resetxlog.c:211 pg_resetxlog.c:218 pg_resetxlog.c:225 -#: pg_resetxlog.c:231 pg_resetxlog.c:239 -#, c-format -msgid "Try \"%s --help\" for more information.\n" -msgstr "F�rs�k med \"%s --help\" f�r mer information.\n" - -#: pg_resetxlog.c:141 -#, c-format -msgid "%s: transaction ID epoch (-e) must not be -1\n" -msgstr "%s: transaktions-ID epoch (-e) f�r inte vara -1\n" - -#: pg_resetxlog.c:150 -#, c-format -msgid "%s: invalid argument for option -x\n" -msgstr "%s: ogiltigt argument till flaggan -x\n" - -#: pg_resetxlog.c:156 -#, c-format -msgid "%s: transaction ID (-x) must not be 0\n" -msgstr "%s: transaktions-ID (-x) f�r inte vara 0\n" - -#: pg_resetxlog.c:165 -#, c-format -msgid "%s: invalid argument for option -o\n" -msgstr "%s: ogiltigt argument till flaggan -o\n" - -#: pg_resetxlog.c:171 -#, c-format -msgid "%s: OID (-o) must not be 0\n" -msgstr "%s: OID (-o) f�r inte vara 0\n" - -#: pg_resetxlog.c:180 -#, c-format -msgid "%s: invalid argument for option -m\n" -msgstr "%s: ogiltigt argument till flaggan -m\n" - -#: pg_resetxlog.c:186 -#, c-format -msgid "%s: multitransaction ID (-m) must not be 0\n" -msgstr "%s: multitransaktions-ID (-m) f�r inte vara 0\n" - -#: pg_resetxlog.c:195 -#, c-format -msgid "%s: invalid argument for option -O\n" -msgstr "%s: ogiltigt argument till flaggan -O\n" - -#: pg_resetxlog.c:201 
-#, c-format -msgid "%s: multitransaction offset (-O) must not be -1\n" -msgstr "%s: multitransaktionsoffset (-O) f�r inte vara -1\n" - -#: pg_resetxlog.c:210 pg_resetxlog.c:217 pg_resetxlog.c:224 -#, c-format -msgid "%s: invalid argument for option -l\n" -msgstr "%s: ogiltigt argument till flaggan -l\n" - -#: pg_resetxlog.c:238 -#, c-format -msgid "%s: no data directory specified\n" -msgstr "%s: ingen datakatalog angiven\n" - -#: pg_resetxlog.c:252 -#, c-format -msgid "%s: cannot be executed by \"root\"\n" -msgstr "%s: kan inte exekveras av \"root\"\n" - -#: pg_resetxlog.c:254 -#, c-format -msgid "You must run %s as the PostgreSQL superuser.\n" -msgstr "Du m�ste k�ra %s som PostgreSQLs superanv�ndare.\n" - -#: pg_resetxlog.c:264 -#, c-format -msgid "%s: could not change directory to \"%s\": %s\n" -msgstr "%s: kunde byta katalog till \"%s\": %s\n" - -#: pg_resetxlog.c:279 pg_resetxlog.c:407 -#, c-format -msgid "%s: could not open file \"%s\" for reading: %s\n" -msgstr "%s: kunde inte �ppna fil \"%s\" f�r l�sning: %s\n" - -#: pg_resetxlog.c:285 -#, c-format -msgid "" -"%s: lock file \"%s\" exists\n" -"Is a server running? If not, delete the lock file and try again.\n" -msgstr "" -"%s: l�sfil \"%s\" existerar\n" -"K�r servern redan? Om inte, radera l�sfilen och f�rs�k igen.\n" - -#: pg_resetxlog.c:355 -#, c-format -msgid "" -"\n" -"If these values seem acceptable, use -f to force reset.\n" -msgstr "" -"\n" -"Om dessa v�rden verkar acceptable, anv�nd -f f�r\n" -"att forcera �terst�llande.\n" - -#: pg_resetxlog.c:367 -#, c-format -msgid "" -"The database server was not shut down cleanly.\n" -"Resetting the transaction log might cause data to be lost.\n" -"If you want to proceed anyway, use -f to force reset.\n" -msgstr "" -"Databasservern st�ngdes inte ner korrekt. 
Att �terst�lla\n" -"transaktionsloggen kan medf�ra att data f�rloras.\n" -"Om du vill forts�tta �nd�, anv�nd -f f�r att forcera\n" -"�terst�llande.\n" - -#: pg_resetxlog.c:381 -#, c-format -msgid "Transaction log reset\n" -msgstr "�terst�llande fr�n transaktionslogg\n" - -#: pg_resetxlog.c:410 -#, c-format -msgid "" -"If you are sure the data directory path is correct, execute\n" -" touch %s\n" -"and try again.\n" -msgstr "" -"Om du �r s�ker p� att datakatalogs�kv�gen �r korrekt s� g�r\n" -" touch %s\n" -"och f�rs�k igen.\n" - -#: pg_resetxlog.c:423 -#, c-format -msgid "%s: could not read file \"%s\": %s\n" -msgstr "%s: kunde inte l�sa fil \"%s\": %s\n" - -#: pg_resetxlog.c:446 -#, c-format -msgid "%s: pg_control exists but has invalid CRC; proceed with caution\n" -msgstr "" -"%s: pg_control existerar men har ogiltig CRC; forts�tt med f�rsiktighet\n" - -#: pg_resetxlog.c:455 -#, c-format -msgid "%s: pg_control exists but is broken or unknown version; ignoring it\n" -msgstr "" -"%s: pg_control existerar men �r trasig eller har ok�nd version; ignorerar " -"den\n" - -#: pg_resetxlog.c:549 -#, c-format -msgid "" -"Guessed pg_control values:\n" -"\n" -msgstr "" -"Gissade pg_control-v�rden:\n" -"\n" - -#: pg_resetxlog.c:551 -#, c-format -msgid "" -"pg_control values:\n" -"\n" -msgstr "" -"pg_control-v�rden:\n" -"\n" - -#: pg_resetxlog.c:560 -#, c-format -msgid "First log file ID after reset: %u\n" -msgstr "F�rsta loggfil efter nollst�llning: %u\n" - -#: pg_resetxlog.c:562 -#, c-format -msgid "First log file segment after reset: %u\n" -msgstr "F�rsta loggfilsegment efter nollst.: %u\n" - -#: pg_resetxlog.c:564 -#, c-format -msgid "pg_control version number: %u\n" -msgstr "pg_control versionsnummer: %u\n" - -#: pg_resetxlog.c:566 -#, c-format -msgid "Catalog version number: %u\n" -msgstr "Katalogversionsnummer: %u\n" - -#: pg_resetxlog.c:568 -#, c-format -msgid "Database system identifier: %s\n" -msgstr "Databasens systemidentifierare: %s\n" - -#: pg_resetxlog.c:570 -#, 
c-format -msgid "Latest checkpoint's TimeLineID: %u\n" -msgstr "Senaste kontrollpunktens TimeLineID: %u\n" - -#: pg_resetxlog.c:572 -#, c-format -msgid "Latest checkpoint's NextXID: %u/%u\n" -msgstr "Senaste kontrollpunktens NextXID: %u/%u\n" - -#: pg_resetxlog.c:575 -#, c-format -msgid "Latest checkpoint's NextOID: %u\n" -msgstr "Senaste kontrollpunktens NextOID: %u\n" - -# FIXME: too wide -#: pg_resetxlog.c:577 -#, c-format -msgid "Latest checkpoint's NextMultiXactId: %u\n" -msgstr "Senaste kontrollpunktens NextMultiXactId: %u\n" - -#: pg_resetxlog.c:579 -#, c-format -msgid "Latest checkpoint's NextMultiOffset: %u\n" -msgstr "Senaste kontrollpunktens NextMultiOffset: %u\n" - -#: pg_resetxlog.c:581 -#, c-format -msgid "Latest checkpoint's oldestXID: %u\n" -msgstr "Senaste kontrollpunktens oldestXID: %u\n" - -# FIXME: too wide -#: pg_resetxlog.c:583 -#, c-format -msgid "Latest checkpoint's oldestXID's DB: %u\n" -msgstr "Senaste kontrollpunktens oldestXID:s DB: %u\n" - -# FIXME: too wide -#: pg_resetxlog.c:585 -#, c-format -msgid "Latest checkpoint's oldestActiveXID: %u\n" -msgstr "Senaste kontrollpunktens oldestActiveXID: %u\n" - -#: pg_resetxlog.c:587 -#, c-format -msgid "Maximum data alignment: %u\n" -msgstr "Maximal data-alignment: %u\n" - -#: pg_resetxlog.c:590 -#, c-format -msgid "Database block size: %u\n" -msgstr "Databasens blockstorlek: %u\n" - -#: pg_resetxlog.c:592 -#, c-format -msgid "Blocks per segment of large relation: %u\n" -msgstr "Block per segment i stor relation: %u\n" - -#: pg_resetxlog.c:594 -#, c-format -msgid "WAL block size: %u\n" -msgstr "WAL-blockstorlek: %u\n" - -#: pg_resetxlog.c:596 -#, c-format -msgid "Bytes per WAL segment: %u\n" -msgstr "Bytes per WAL-segment: %u\n" - -#: pg_resetxlog.c:598 -#, c-format -msgid "Maximum length of identifiers: %u\n" -msgstr "Maximal l�ngd p� identifierare: %u\n" - -#: pg_resetxlog.c:600 -#, c-format -msgid "Maximum columns in an index: %u\n" -msgstr "Maximalt antal kolumner i index: %u\n" - -#: 
pg_resetxlog.c:602 -#, c-format -msgid "Maximum size of a TOAST chunk: %u\n" -msgstr "Maximal storlek p� TOAST-bit: %u\n" - -#: pg_resetxlog.c:604 -#, c-format -msgid "Date/time type storage: %s\n" -msgstr "Lagringstyp f�r datum/tid: %s\n" - -#: pg_resetxlog.c:605 -msgid "64-bit integers" -msgstr "64-bits heltal" - -#: pg_resetxlog.c:605 -msgid "floating-point numbers" -msgstr "flyttalsnummer" - -#: pg_resetxlog.c:606 -#, fuzzy, c-format -msgid "Float4 argument passing: %s\n" -msgstr "Maximal data-alignment: %u\n" - -#: pg_resetxlog.c:607 pg_resetxlog.c:609 -msgid "by value" -msgstr "" - -#: pg_resetxlog.c:607 pg_resetxlog.c:609 -msgid "by reference" -msgstr "" - -#: pg_resetxlog.c:608 -#, fuzzy, c-format -msgid "Float8 argument passing: %s\n" -msgstr "Maximal data-alignment: %u\n" - -#: pg_resetxlog.c:671 -#, c-format -msgid "" -"%s: internal error -- sizeof(ControlFileData) is too large ... fix " -"PG_CONTROL_SIZE\n" -msgstr "" -"%s: internt fel -- sizeof(ControlFileData) �r f�r stor ... 
r�tt till " -"PG_CONTROL_SIZE\n" - -#: pg_resetxlog.c:686 -#, c-format -msgid "%s: could not create pg_control file: %s\n" -msgstr "%s: kunde inte skapa pg_control-fil: %s\n" - -#: pg_resetxlog.c:697 -#, c-format -msgid "%s: could not write pg_control file: %s\n" -msgstr "%s: kunde inte skriva pg_control-fil: %s\n" - -#: pg_resetxlog.c:704 pg_resetxlog.c:1011 -#, c-format -msgid "%s: fsync error: %s\n" -msgstr "%s: fsync fel: %s\n" - -#: pg_resetxlog.c:742 pg_resetxlog.c:817 pg_resetxlog.c:873 -#, c-format -msgid "%s: could not open directory \"%s\": %s\n" -msgstr "%s: kunde inte �ppna katalog \"%s\": %s\n" - -#: pg_resetxlog.c:786 pg_resetxlog.c:850 pg_resetxlog.c:907 -#, c-format -msgid "%s: could not read from directory \"%s\": %s\n" -msgstr "%s: kunde inte l�sa fr�n katalog \"%s\": %s\n" - -#: pg_resetxlog.c:831 pg_resetxlog.c:888 -#, c-format -msgid "%s: could not delete file \"%s\": %s\n" -msgstr "%s: kunde inte radera filen \"%s\": %s\n" - -#: pg_resetxlog.c:978 -#, c-format -msgid "%s: could not open file \"%s\": %s\n" -msgstr "%s: kunde inte �ppna fil \"%s\": %s\n" - -#: pg_resetxlog.c:989 pg_resetxlog.c:1003 -#, c-format -msgid "%s: could not write file \"%s\": %s\n" -msgstr "%s: kunde inte skriva fil \"%s\": %s\n" - -#: pg_resetxlog.c:1022 -#, c-format -msgid "" -"%s resets the PostgreSQL transaction log.\n" -"\n" -msgstr "" -"%s �terst�ller PostgreSQL transaktionslogg.\n" -"\n" - -#: pg_resetxlog.c:1023 -#, c-format -msgid "" -"Usage:\n" -" %s [OPTION]... DATADIR\n" -"\n" -msgstr "" -"Anv�ndning:\n" -" %s [FLAGGA]... 
DATAKATALOG\n" -"\n" - -#: pg_resetxlog.c:1024 -#, c-format -msgid "Options:\n" -msgstr "Flaggor:\n" - -#: pg_resetxlog.c:1025 -#, c-format -msgid " -e XIDEPOCH set next transaction ID epoch\n" -msgstr " -x XIDEPOCH s�tt n�sta transaktions-ID-epoch\n" - -#: pg_resetxlog.c:1026 -#, c-format -msgid " -f force update to be done\n" -msgstr " -f forcera �terst�llande\n" - -#: pg_resetxlog.c:1027 -#, c-format -msgid "" -" -l TLI,FILE,SEG force minimum WAL starting location for new transaction " -"log\n" -msgstr "" -" -l TLI,FILID,SEG ange minsta WAL-startposition f�r ny transaktion\n" - -#: pg_resetxlog.c:1028 -#, c-format -msgid " -m XID set next multitransaction ID\n" -msgstr " -m XID s�tt n�sta multitransaktions-ID\n" - -#: pg_resetxlog.c:1029 -#, c-format -msgid "" -" -n no update, just show extracted control values (for " -"testing)\n" -msgstr "" -" -n ingen updatering, visa bara kontrollv�rden (f�r testning)\n" - -#: pg_resetxlog.c:1030 -#, c-format -msgid " -o OID set next OID\n" -msgstr " -o OID s�tt n�sta OID\n" - -#: pg_resetxlog.c:1031 -#, c-format -msgid " -O OFFSET set next multitransaction offset\n" -msgstr " -O OFFSET s�tt n�sta multitransaktionsoffset\n" - -#: pg_resetxlog.c:1032 -#, c-format -msgid " -x XID set next transaction ID\n" -msgstr " -x XID s�tt n�sta transaktions-ID\n" - -#: pg_resetxlog.c:1033 -#, c-format -msgid " --help show this help, then exit\n" -msgstr " --help visa denna hj�lp, avsluta sedan\n" - -#: pg_resetxlog.c:1034 -#, c-format -msgid " --version output version information, then exit\n" -msgstr " --version visa versionsinformation, avsluta sedan\n" - -#: pg_resetxlog.c:1035 -#, c-format -msgid "" -"\n" -"Report bugs to <[email protected]>.\n" -msgstr "" -"\n" -"Reportera fel till <[email protected]>.\n" diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 72b60e40b0..1cc9553571 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -1677,7 +1677,11 @@ connection_warnings(bool in_startup) /* For version 
match, only print psql banner on startup. */ else if (in_startup) #ifdef PGXC +#ifdef XCP + printf("%s (PGXL %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION); +#else printf("%s (PGXC %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION); +#endif #else printf("%s (%s)\n", pset.progname, PG_VERSION); #endif diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 8a0beca3c9..6d2216b650 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -652,7 +652,11 @@ static void showVersion(void) { #ifdef PGXC +#ifdef XCP + puts("psql (Postgres-XL) " PGXC_VERSION); +#else puts("psql (Postgres-XC) " PGXC_VERSION); +#endif puts("(based on PostgreSQL) " PG_VERSION); #else puts("psql (PostgreSQL) " PG_VERSION); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 63052c5f0c..216b3e796a 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -700,7 +700,10 @@ static const pgsql_thing_t words_after_create[] = { {"TEMP", NULL, NULL, THING_NO_DROP}, /* for CREATE TEMP TABLE ... */ {"TEMPLATE", Query_for_list_of_ts_templates, NULL, THING_NO_SHOW}, {"TEXT SEARCH", NULL, NULL}, +#ifndef PGXC + /* PGXCTODO: This should be re-enabled once TRIGGER is supported */ {"TRIGGER", "SELECT pg_catalog.quote_ident(tgname) FROM pg_catalog.pg_trigger WHERE substring(pg_catalog.quote_ident(tgname),1,%d)='%s'"}, +#endif {"TYPE", NULL, &Query_for_list_of_datatypes}, {"UNIQUE", NULL, NULL, THING_NO_DROP}, /* for CREATE UNIQUE INDEX ... */ {"UNLOGGED", NULL, NULL, THING_NO_DROP}, /* for CREATE UNLOGGED TABLE @@ -787,7 +790,7 @@ psql_completion(char *text, int start, int end) static const char *const sql_commands[] = { #ifdef PGXC - /* + /* * Added "CLEAN" and "EXECUTE DIRECT" * Removed LISTEN, NOTIFY, RELEASE, SAVEPOINT and UNLISTEN */ @@ -886,13 +889,13 @@ psql_completion(char *text, int start, int end) /* * Added: "NODE" (NODE NAME cannot be altered). 
* Removed: "FOREIGN DATA WRAPPER", "FOREIGN TABLE", "LARGE OBJECT", - * "SERVER", "USER MAPPING FOR". + * "SERVER", "TRIGGER", "USER MAPPING FOR". */ {"AGGREGATE", "COLLATION", "CONVERSION", "DATABASE", "DEFAULT PRIVILEGES", "DOMAIN", "EXTENSION", "FUNCTION", "GROUP", "INDEX", "LANGUAGE", "NODE", "NODE GROUP", "OPERATOR", "ROLE", "SCHEMA", "SEQUENCE", "TABLE", - "TABLESPACE", "TEXT SEARCH", "TRIGGER", "TYPE", + "TABLESPACE", "TEXT SEARCH", "TYPE", "USER", "VIEW", NULL}; #else {"AGGREGATE", "COLLATION", "CONVERSION", "DATABASE", "DEFAULT PRIVILEGES", "DOMAIN", @@ -1261,6 +1264,8 @@ psql_completion(char *text, int start, int end) COMPLETE_WITH_LIST(list_ALTERVIEW); } +#ifndef PGXC + /* PGXCTODO: This should be re-enabled once TRIGGER is supported */ /* ALTER TRIGGER <name>, add ON */ else if (pg_strcasecmp(prev3_wd, "ALTER") == 0 && pg_strcasecmp(prev2_wd, "TRIGGER") == 0) @@ -1285,6 +1290,7 @@ psql_completion(char *text, int start, int end) else if (pg_strcasecmp(prev4_wd, "TRIGGER") == 0 && pg_strcasecmp(prev2_wd, "ON") == 0) COMPLETE_WITH_CONST("RENAME TO"); +#endif /* * If we detect ALTER TABLE <name>, suggest sub commands @@ -2081,6 +2087,8 @@ psql_completion(char *text, int start, int end) pg_strcasecmp(prev2_wd, "CONFIGURATION") == 0) COMPLETE_WITH_CONST("("); +#ifndef PGXC + /* PGXCTODO: This should be re-enabled once TRIGGER is supported */ /* CREATE TRIGGER */ /* complete CREATE TRIGGER <name> with BEFORE,AFTER */ else if (pg_strcasecmp(prev3_wd, "CREATE") == 0 && @@ -2147,6 +2155,7 @@ psql_completion(char *text, int start, int end) prev2_wd[0] != '\0') COMPLETE_WITH_CONST("PROCEDURE"); +#endif /* CREATE ROLE,USER,GROUP <name> */ else if (pg_strcasecmp(prev3_wd, "CREATE") == 0 && !(pg_strcasecmp(prev2_wd, "USER") == 0 && pg_strcasecmp(prev_wd, "MAPPING") == 0) && diff --git a/src/gtm/Makefile b/src/gtm/Makefile index 5059642637..480d1bf49e 100644 --- a/src/gtm/Makefile +++ b/src/gtm/Makefile @@ -12,6 +12,37 @@ subdir = src/gtm top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = client common config libpq main path proxy recovery +WANTED_DIRS=common path libpq client recovery main proxy gtm_ctl -$(recurse) +all: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +clobber: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +clean: + @for dir in $(WANTED_DIRS); do \ + $(MAKE) -C $$dir $@ || exit; \ + done + +distclean: clean + +maintainer-clean: distclean + +install: all + $(INSTALL_PROGRAM) main/gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)' + $(INSTALL_PROGRAM) gtm_ctl/gtm_ctl$(X) '$(DESTDIR)$(bindir)/gtm_ctl$(X)' + $(INSTALL_PROGRAM) proxy/gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)' + $(INSTALL_DATA) $(srcdir)/main/gtm.conf.sample '$(DESTDIR)$(datadir)/gtm.conf.sample' + $(INSTALL_DATA) $(srcdir)/proxy/gtm_proxy.conf.sample '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample' + +uninstall: + rm -f $(DESTDIR)$(bindir)/gtm$(X) + rm -f $(DESTDIR)$(bindir)/gtm_ctl$(X) + rm -f $(DESTDIR)$(bindir)/gtm_proxy$(X) + rm -f $(DESTDIR)$(datadir)/gtm.conf.sample + rm -f $(DESTDIR)$(datadir)/gtm_proxy.conf.sample diff --git a/src/gtm/client/Makefile b/src/gtm/client/Makefile index 56dba648ce..e8204bb4b9 100644 --- a/src/gtm/client/Makefile +++ b/src/gtm/client/Makefile @@ -11,20 +11,22 @@ top_builddir=../../.. 
include $(top_builddir)/src/Makefile.global subdir=src/gtm/client -override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) -LIBS += $(PTHREAD_LIBS) +NAME=gtmclient +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 -include $(top_srcdir)/src/backend/common.mk +OBJS=fe-misc.o fe-connect.o pqexpbuffer.o ip.o strlcpy.o gtm_client.o fe-protocol.o +LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq -OBJS = fe-misc.o fe-connect.o gtm_client.o fe-protocol.o ip.o pqexpbuffer.o +LIBS=-lpthread -all: libgtmclient.a +all:all-lib -libgtmclient.a: $(OBJS) - $(AR) $(AROPT) $@ $^ +include $(top_srcdir)/src/Makefile.shlib clean: - rm -f $(OBJS) libgtmclient.a + rm -f $(OBJS) + rm -f libgtmclient.a libgtmclient.so libgtmclient.so.1 libgtmclient.so.1.0 distclean: clean diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index bfcb5f4e54..1bcb3e6a17 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -3,6 +3,11 @@ * fe-connect.c * functions related to setting up a connection to the backend * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -889,6 +894,39 @@ freeGTM_Conn(GTM_Conn *conn) free(conn->outBuffer); termGTMPQExpBuffer(&conn->errorMessage); termGTMPQExpBuffer(&conn->workBuffer); +#ifdef XCP + if (conn->result) + { + /* Free last snapshot if defined */ + if (conn->result->gr_snapshot.sn_xip) + free(conn->result->gr_snapshot.sn_xip); + + /* Depending on result type there could be allocated data */ + switch (conn->result->gr_type) + { + case SEQUENCE_INIT_RESULT: + case SEQUENCE_RESET_RESULT: + case SEQUENCE_CLOSE_RESULT: + case SEQUENCE_RENAME_RESULT: + case SEQUENCE_ALTER_RESULT: + case SEQUENCE_SET_VAL_RESULT: + if (conn->result->gr_resdata.grd_seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seqkey.gsk_key); + break; + + case SEQUENCE_GET_NEXT_RESULT: + case SEQUENCE_GET_LAST_RESULT: + if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); + break; + + default: + break; + } + + free(conn->result); + } +#endif free(conn); } diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index b4a0e3e2fc..1ebf067ec9 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -3,6 +3,11 @@ * fe-protocol3.c * functions that are specific to frontend/backend protocol version 3 * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -366,6 +371,11 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) case END_BACKUP_RESULT: break; +#ifdef XCP + case REGISTER_SESSION_RESULT: + break; +#endif + case TXN_BEGIN_RESULT: if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txnhandle, sizeof (GTM_TransactionHandle), conn)) @@ -549,6 +559,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) result->gr_status = GTM_RESULT_ERROR; break; + case SEQUENCE_GET_CURRENT_RESULT: case SEQUENCE_GET_NEXT_RESULT: case SEQUENCE_GET_LAST_RESULT: if (gtmpqReadSeqKey(&result->gr_resdata.grd_seq.seqkey, conn)) @@ -559,6 +570,12 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) if (gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.seqval, sizeof (GTM_Sequence), conn)) result->gr_status = GTM_RESULT_ERROR; +#ifdef XCP + if (result->gr_type == SEQUENCE_GET_NEXT_RESULT && + gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.rangemax, + sizeof (GTM_Sequence), conn)) + result->gr_status = GTM_RESULT_ERROR; +#endif break; case SEQUENCE_LIST_RESULT: @@ -570,7 +587,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) } result->gr_resdata.grd_seq_list.seq = - (GTM_SeqInfo **)malloc(sizeof(GTM_SeqInfo *) * + (GTM_SeqInfo **)malloc(sizeof(GTM_SeqInfo) * result->gr_resdata.grd_seq_list.seq_count); for (i = 0 ; i < result->gr_resdata.grd_seq_list.seq_count; i++) @@ -593,7 +610,8 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) break; } - result->gr_resdata.grd_seq_list.seq[i] = gtm_deserialize_sequence(buf, buflen); + gtm_deserialize_sequence(result->gr_resdata.grd_seq_list.seq+i, + buf, buflen); free(buf); } @@ -733,7 +751,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) for (i = 0 ; i < result->gr_resdata.grd_node_list.num_node; i++) { int size; - char buf[1024]; + char buf[8092]; 
GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *)malloc(sizeof(GTM_PGXCNodeInfo)); if (gtmpqGetInt(&size, sizeof(int32), conn)) @@ -741,19 +759,37 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) result->gr_status = GTM_RESULT_ERROR; break; } + if (size > 8092) + { + result->gr_status = GTM_RESULT_ERROR; + printfGTMPQExpBuffer(&conn->errorMessage, "buffer size not large enough for node list data"); + result->gr_status = GTM_RESULT_ERROR; + } if (gtmpqGetnchar((char *)&buf, size, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - gtm_deserialize_pgxcnodeinfo(data, buf, size); - +#ifdef XCP + if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + else + { + result->gr_resdata.grd_node_list.nodeinfo[i] = data; + } +#else + gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage); result->gr_resdata.grd_node_list.nodeinfo[i] = data; +#endif } break; } + default: printfGTMPQExpBuffer(&conn->errorMessage, "unexpected result type from server; result typr was \"%d\"\n", @@ -813,6 +849,7 @@ gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) result->gr_resdata.grd_seqkey.gsk_key = NULL; break; + case SEQUENCE_GET_CURRENT_RESULT: case SEQUENCE_GET_NEXT_RESULT: case SEQUENCE_GET_LAST_RESULT: if (result->gr_resdata.grd_seq.seqkey.gsk_key != NULL) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 0e396d5eb0..d099ba6729 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -2,6 +2,11 @@ * * gtm-client.c * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -51,8 +56,17 @@ static int abort_transaction_multi_internal(GTM_Conn *conn, int txn_count, Globa static int open_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval, bool cycle, bool is_backup); +#ifdef XCP +static int get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax, bool is_backup); +static int set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence nextval, + bool iscalled, bool is_backup); +#else static GTM_Sequence get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup); static int set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled, bool is_backup); +#endif static int reset_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup); static int commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_backup); static int close_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup); @@ -229,6 +243,11 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) num_node = res->gr_resdata.grd_node_list.num_node; fprintf(stderr, "get_node_list: num_node=%ld\n", num_node); + if (num_node > maxlen) + { + fprintf(stderr, "Error: number of nodes %zu greater than maximum", num_node); + goto receive_failed; + } for (i = 0; i < num_node; i++) { @@ -348,13 +367,14 @@ send_failed: * get_sequence_list() * * returns a number of sequences on success, -1 on failure. + * Returned seq_list is pointing to GTM_Result structure, the data should be + * copied before the next call to getResult. 
*/ size_t -get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list, size_t seq_max) +get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list) { GTM_Result *res = NULL; time_t finish_time; - int i; /* Start the message. */ if (gtmpqPutMsgStart('C', true, conn) || @@ -380,15 +400,9 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list, size_t seq_max) if (res->gr_status == GTM_RESULT_OK) Assert(res->gr_type == SEQUENCE_LIST_RESULT); - for (i = 0; i < res->gr_resdata.grd_seq_list.seq_count; i++) - { - seq_list[i] = res->gr_resdata.grd_seq_list.seq[i]; + *seq_list = res->gr_resdata.grd_seq_list.seq; - if ( i >= seq_max ) - break; - } - - return i; + return res->gr_resdata.grd_seq_list.seq_count; receive_failed: send_failed: @@ -1041,16 +1055,16 @@ open_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increme gtmpqPutc(cycle, conn)) goto send_failed; - if (!is_backup) - { - /* Finish the message. */ - if (gtmpqPutMsgEnd(conn)) - goto send_failed; + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; - /* Flush to ensure backend gets it. */ - if (gtmpqFlush(conn)) - goto send_failed; + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + if (!is_backup) + { finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; if (gtmpqWaitTimed(true, false, conn, finish_time) || gtmpqReadData(conn) < 0) @@ -1248,6 +1262,111 @@ send_failed: return -1; } +#ifdef XCP +/* + * Request from GTM current value of the specified sequence in the specified + * distributed session. + * Function returns GTM_RESULT_OK if the current value is defined, it sets + * the *result parameter in this case. + * Other return value means a problem. Check GTMPQerrorMessage(conn) for details + * about the problem. 
+ */ +int +get_current(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence *result) +#else +GTM_Sequence +get_current(GTM_Conn *conn, GTM_SequenceKey key) +#endif +{ + GTM_Result *res = NULL; + time_t finish_time; +#ifdef XCP + int coord_namelen = coord_name ? strlen(coord_name) : 0; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) || + gtmpqPutInt(coord_namelen, 4, conn) || + (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) || + gtmpqPutInt(coord_procid, 4, conn)) + goto send_failed; +#else + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) + goto send_failed; +#endif + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + +#ifdef XCP + if (res->gr_status == GTM_RESULT_OK) + *result = res->gr_resdata.grd_seq.seqval; + + return res->gr_status; +#else + if (res->gr_status == GTM_RESULT_OK) + return res->gr_resdata.grd_seq.seqval; + else + return InvalidSequenceValue; +#endif + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; +#ifdef XCP + return GTM_RESULT_COMM_ERROR; +#else + return -1; +#endif +} + +#ifdef XCP +/* + * Submit to GTM new next value of the specified sequence in the specified + * distributed session. 
The nextval parameter is the new value, if is called + * is set to false the nextval will be the next value returned from the sequence + * by nextval() function, if true the function returns incremented value. + * Function returns GTM_RESULT_OK if it succeedes. + * Other return value means a problem. Check GTMPQerrorMessage(conn) for details + * about the problem. + */ +int +set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled) +{ + return set_val_internal(conn, key, coord_name, coord_procid, nextval, + iscalled, false); +} + +int +bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled) +{ + return set_val_internal(conn, key, coord_name, coord_procid, nextval, + iscalled, true); +} +#else int set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled) { @@ -1259,18 +1378,34 @@ bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool isc { return set_val_internal(conn, key, nextval, iscalled, true); } +#endif +#ifdef XCP +static int +set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence nextval, + bool iscalled, bool is_backup) +#else static int set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled, bool is_backup) +#endif { GTM_Result *res = NULL; time_t finish_time; +#ifdef XCP + int coord_namelen = coord_name ? strlen(coord_name) : 0; +#endif /* Start the message. */ if (gtmpqPutMsgStart('C', true, conn) || gtmpqPutInt(is_backup ? 
MSG_BKUP_SEQUENCE_SET_VAL : MSG_SEQUENCE_SET_VAL, sizeof (GTM_MessageType), conn) || gtmpqPutInt(key->gsk_keylen, 4, conn) || gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) || +#ifdef XCP + gtmpqPutInt(coord_namelen, 4, conn) || + (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) || + gtmpqPutInt(coord_procid, 4, conn) || +#endif gtmpqPutnchar((char *)&nextval, sizeof (GTM_Sequence), conn) || gtmpqPutc(iscalled, conn)) goto send_failed; @@ -1301,9 +1436,39 @@ receive_failed: send_failed: conn->result = makeEmptyResultIfIsNull(conn->result); conn->result->gr_status = GTM_RESULT_COMM_ERROR; +#ifdef XCP + return GTM_RESULT_COMM_ERROR; +#else return -1; +#endif +} + +#ifdef XCP +/* + * Rexuest from GTM next value of the specified sequence. + * Function returns GTM_RESULT_OK if it succeedes, it sets the *result parameter + * in this case. + * Other return value means a problem. Check GTMPQerrorMessage(conn) for details + * about the problem. + */ +int +get_next(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax) +{ + return get_next_internal(conn, key, coord_name, coord_procid, + range, result, rangemax, false); } +int +bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax) +{ + return get_next_internal(conn, key, coord_name, coord_procid, + range, result, rangemax, true); +} +#else GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key) { @@ -1315,19 +1480,41 @@ bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key) { return get_next_internal(conn, key, true); } +#endif +#ifdef XCP +static int +get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax, bool is_backup) +#else static GTM_Sequence get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool 
is_backup) +#endif { GTM_Result *res = NULL; time_t finish_time; +#ifdef XCP + int coord_namelen = coord_name ? strlen(coord_name) : 0; /* Start the message. */ if (gtmpqPutMsgStart('C', true, conn) || gtmpqPutInt(is_backup ? MSG_BKUP_SEQUENCE_GET_NEXT : MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) || gtmpqPutInt(key->gsk_keylen, 4, conn) || + gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) || + gtmpqPutInt(coord_namelen, 4, conn) || + (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) || + gtmpqPutInt(coord_procid, 4, conn) || + gtmpqPutnchar((char *)&range, sizeof (GTM_Sequence), conn)) + goto send_failed; +#else + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(is_backup ? MSG_BKUP_SEQUENCE_GET_NEXT : MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(key->gsk_keylen, 4, conn) || gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn)) goto send_failed; +#endif /* Finish the message. */ if (gtmpqPutMsgEnd(conn)) @@ -1347,10 +1534,19 @@ get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup) if ((res = GTMPQgetResult(conn)) == NULL) goto receive_failed; +#ifdef XCP + if (res->gr_status == GTM_RESULT_OK) + { + *result = res->gr_resdata.grd_seq.seqval; + *rangemax = res->gr_resdata.grd_seq.rangemax; + } + return res->gr_status; +#else if (res->gr_status == GTM_RESULT_OK) return res->gr_resdata.grd_seq.seqval; else return InvalidSequenceValue; +#endif } return GTM_RESULT_OK; @@ -1358,7 +1554,11 @@ receive_failed: send_failed: conn->result = makeEmptyResultIfIsNull(conn->result); conn->result->gr_status = GTM_RESULT_COMM_ERROR; +#ifdef XCP + return GTM_RESULT_COMM_ERROR; +#else return -1; +#endif } int @@ -2116,3 +2316,67 @@ send_failed: conn->result->gr_status = GTM_RESULT_COMM_ERROR; return -1; } + + +#ifdef XCP +/* + * Submit to GTM information about started distributed session. 
+ * The information is the session identifier consisting of coordinator name and + * pid of the master process, and the BackendId of the master process. + * The BackendId is used to track session end. BackendIds are the sequential + * numbers from 1 to max_connections, and they are unique among active sessions + * under the same postmaster. So if another session on the same coordinator with + * the same BackendId is registering, that means the previous session is closed + * and all resources assigned to it could be released. + */ +int +register_session(GTM_Conn *conn, const char *coord_name, int coord_procid, + int coord_backendid) +{ + GTM_Result *res = NULL; + time_t finish_time; + int32 len = strlen(coord_name); + + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_REGISTER_SESSION, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(len, sizeof(len), conn) || + gtmpqPutnchar(coord_name, len, conn) || + gtmpqPutInt(coord_procid, sizeof(coord_procid), conn) || + gtmpqPutInt(coord_backendid, sizeof(coord_backendid), conn)) + { + goto send_failed; + } + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + { + goto send_failed; + } + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + { + goto send_failed; + } + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + { + goto receive_failed; + } + + if ((res = GTMPQgetResult(conn)) == NULL) + { + goto receive_failed; + } + + return res->gr_status; + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return -1; +} +#endif diff --git a/src/gtm/client/strlcpy.c b/src/gtm/client/strlcpy.c new file mode 100644 index 0000000000..48cdf5e2c9 --- /dev/null +++ b/src/gtm/client/strlcpy.c @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * strlcpy.c + * strncpy done right + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $ + * + * This file was taken from OpenBSD and is used on platforms that don't + * provide strlcpy(). The OpenBSD copyright terms follow. + *------------------------------------------------------------------------- + */ + +/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */ + +/* + * Copyright (c) 1998 Todd C. Miller <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "gtm/gtm_c.h" + + +/* + * Copy src to string dst of size siz. At most siz-1 characters + * will be copied. Always NUL terminates (unless siz == 0). + * Returns strlen(src); if retval >= siz, truncation occurred. + * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html + */ +size_t +strlcpy(char *dst, const char *src, size_t siz) +{ + char *d = dst; + const char *s = src; + size_t n = siz; + + /* Copy as many bytes as will fit */ + if (n != 0) + { + while (--n != 0) + { + if ((*d++ = *s++) == '\0') + break; + } + } + + /* Not enough room in dst, add NUL and traverse rest of src */ + if (n == 0) + { + if (siz != 0) + *d = '\0'; /* NUL-terminate dst */ + while (*s++) + ; + } + + return (s - src - 1); /* count does not include NUL */ +} diff --git a/src/gtm/common/.gitignore b/src/gtm/common/.gitignore new file mode 100644 index 0000000000..5963e7b19a --- /dev/null +++ b/src/gtm/common/.gitignore @@ -0,0 +1 @@ +/gtm_opt_scanner.c diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile index c43e000ead..31e0c25ff9 100644 --- a/src/gtm/common/Makefile +++ b/src/gtm/common/Makefile @@ -8,24 +8,41 @@ # #----------------------------------------------------------------------------- top_builddir=../../.. -include $(top_builddir)/src/Makefile.global subdir=src/gtm/common -override CPPFLAGS := -I. -I$(libpq_srcdir) $(CPPFLAGS) -LIBS += $(PTHREAD_LIBS) +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I. 
-I$(srcdir) $(CPPFLAGS) + +NAME=gtm + +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 + +LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq +LIBS=-lpthread + +OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \ + gtm_list.o gtm_serialize.o gtm_serialize_debug.o -include $(top_srcdir)/src/backend/common.mk +all:all-lib -OBJS = gtm_utils.o gtm_lock.o gtm_serialize.o gtm_serialize_debug.o \ - aset.o assert.o elog.o mcxt.o stringinfo.o gtm_list.o +gtm_opt_handler.o: gtm_opt_scanner.c -all: libgtmcommon.a +gtm_opt_scanner.c: gtm_opt_scanner.l +ifdef FLEX + $(FLEX) $(FLEXFLAGS) -o'$@' $< +else + @$(missing) flex $< $@ +endif -libgtmcommon.a: $(OBJS) - $(AR) $(AROPT) $@ $^ +# Shared library stuff +include $(top_srcdir)/src/Makefile.shlib +# Note that gtm_opt_scanner.c is not deleted by make clean as we want it in distribution tarballs clean: - rm -f $(OBJS) libgtmcommon.a + rm -f $(OBJS) + rm -f libgtm.so libgtm.so.1 libgtm.so.1.0 distclean: clean diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c new file mode 100644 index 0000000000..61c2476599 --- /dev/null +++ b/src/gtm/common/gtm_opt_handler.c @@ -0,0 +1,3509 @@ +/* -*-pgsql-c-*- */ +/* + * Scanner for the configuration file + * + * Copyright (c) 2000-2011, PostgreSQL Global Development Group + * + * src/backend/utils/misc/guc-file.l + */ + +#include "gtm/gtm.h" + +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdlib.h> + +#include "mb/pg_wchar.h" +#include "gtm/path.h" +#include "gtm/assert.h" +#include "gtm/gtm_opt.h" +#include "gtm/gtm_opt_tables.h" +#include "gtm/elog.h" +#include "gtm_opt_scanner.c" + +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg))) + +static unsigned int ConfigFileLineno; + +/* flex fails to supply a prototype for GTMOPT_yylex, so provide one */ +int 
GTMOPT_GTMOPT_yylex(void); + +/* Functions defined in this file */ +static char *GTMOPT_scanstr(const char *s); +static struct config_generic *find_option(const char *name, bool create_placeholders, int elevel); +static char *gtm_opt_strdup(int elevel, const char *src); +static int gtm_opt_name_compare(const char *namea, const char *nameb); +struct config_generic **get_gtm_opt_variables(void); +void build_gtm_opt_variables(void); +static bool gtm_opt_parse_bool(const char *value, bool *result); +static bool gtm_opt_parse_bool_with_len(const char *value, size_t len, bool *result); +static void set_config_sourcefile(const char *name, char *sourcefile, int sourceline); +static int gtm_opt_var_compare(const void *a, const void *b); +static void InitializeOneGTMOption(struct config_generic * gconf); +static void ReportGTMOption(struct config_generic * record); +static char *_ShowOption(struct config_generic * record, bool use_units); + +/* + * Variables to bel fed by specific option definition: gtm_opt.c and gtm_proxy_opt.c + */ +extern char *GTMConfigFileName; +extern char *data_directory; +extern struct config_generic **gtm_opt_variables; +extern int num_gtm_opt_variables; +extern int size_gtm_opt_variables; +extern bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */ +extern char *config_filename; /* Default configuration file name */ +extern int GTMOptUpdateCount; /* Indicates when specific option is updated */ +extern bool isStartUp; + +/* + * Tables of options: to be defined in gtm_opt.c and gtm_proxy_opt.c + */ +extern struct config_bool ConfigureNamesBool[]; +extern struct config_int ConfigureNamesInt[]; +extern struct config_real ConfigureNamesReal[]; +extern struct config_string ConfigureNamesString[]; +extern struct config_enum ConfigureNamesEnum[]; + +/* + * Note: MAX_BACKENDS is limited to 2^23-1 because inval.c stores the + * backend ID as a 3-byte signed integer. 
Even if that limitation were + * removed, we still could not exceed INT_MAX/4 because some places compute + * 4*MaxBackends without any overflow check. This is rechecked in + * check_maxconnections, since MaxBackends is computed as MaxConnections + * plus autovacuum_max_workers plus one (for the autovacuum launcher). + */ +#define MAX_BACKENDS 0x7fffff + +#define KB_PER_MB (1024) +#define KB_PER_GB (1024*1024) + +#define MS_PER_S 1000 +#define S_PER_MIN 60 +#define MS_PER_MIN (1000 * 60) +#define MIN_PER_H 60 +#define S_PER_H (60 * 60) +#define MS_PER_H (1000 * 60 * 60) +#define MIN_PER_D (60 * 24) +#define S_PER_D (60 * 60 * 24) +#define MS_PER_D (1000 * 60 * 60 * 24) + +/* + * Exported function to read and process the configuration file. The + * parameter indicates in what context the file is being read --- either + * postmaster startup (including standalone-backend startup) or SIGHUP. + * All options mentioned in the configuration file are set to new values. + * If an error occurs, no values will be changed. + */ +void +ProcessConfigFile(GtmOptContext context) +{ + int elevel; + ConfigVariable *item, + *head, + *tail; + char *cvc = NULL; + int i; + + Assert((context == GTMC_STARTUP || context == GTMC_SIGHUP)); + + if (context == GTMC_SIGHUP) + elevel = DEBUG2; + else + elevel = ERROR; + + /* Parse the file into a list of option names and values */ + head = tail = NULL; + + if (!ParseConfigFile(GTMConfigFileName, NULL, 0, elevel, &head, &tail)) + goto cleanup_list; + +#if 0 + /* No custom_variable_classes now */ + /* + * This part of the code remained the same as original guc.c because + * we might want to have custom variable class for gtm.conf. + */ + /* + * We need the proposed new value of custom_variable_classes to check + * custom variables with. ParseConfigFile ensured that if it's in + * the file, it's first in the list. But first check to see if we + * have an active value from the command line, which should override + * the file in any case. 
(Since there's no relevant env var, the + * only possible nondefault sources are the file and ARGV.) + */ + cvc_struct = (struct config_string *) + find_option("custom_variable_classes", false, elevel); + Assert(cvc_struct); + if (cvc_struct->gen.reset_source > GTMC_S_FILE) + { + cvc = gtm_opt_strdup(elevel, cvc_struct->reset_val); + if (cvc == NULL) + goto cleanup_list; + } + else if (head != NULL && + gtm_opt_name_compare(head->name, "custom_variable_classes") == 0) + { + /* + * Need to canonicalize the value by calling the check hook. + */ + void *extra = NULL; + + cvc = gtm_opt_strdup(elevel, head->value); + if (cvc == NULL) + goto cleanup_list; + if (extra) + free(extra); + } +#endif + + /* + * Mark all extant GUC variables as not present in the config file. + * We need this so that we can tell below which ones have been removed + * from the file since we last processed it. + */ + for (i = 0; i < num_gtm_opt_variables; i++) + { + struct config_generic *gconf = gtm_opt_variables[i]; + + gconf->status &= ~GTMOPT_IS_IN_FILE; + } + + /* + * Check if all options are valid. As a side-effect, the GTMOPT_IS_IN_FILE + * flag is set on each GUC variable mentioned in the list. + */ + for (item = head; item; item = item->next) + { + char *sep = strchr(item->name, GTMOPT_QUALIFIER_SEPARATOR); + + if (sep) + { + /* + * There is no GUC entry. If we called set_config_option then + * it would make a placeholder, which we don't want to do yet, + * since we could still fail further down the list. Do nothing + * (assuming that making the placeholder will succeed later). + */ + if (find_option(item->name, false, elevel) == NULL) + continue; + /* + * 3. There is already a GUC entry (either real or placeholder) for + * the variable. In this case we should let set_config_option + * check it, since the assignment could well fail if it's a real + * entry. 
+ */ + } + + if (!set_config_option(item->name, item->value, context, + GTMC_S_FILE, false)) + goto cleanup_list; + } + + /* + * Check for variables having been removed from the config file, and + * revert their reset values (and perhaps also effective values) to the + * boot-time defaults. If such a variable can't be changed after startup, + * just throw a warning and continue. (This is analogous to the fact that + * set_config_option only throws a warning for a new but different value. + * If we wanted to make it a hard error, we'd need an extra pass over the + * list so that we could throw the error before starting to apply + * changes.) + */ + for (i = 0; i < num_gtm_opt_variables; i++) + { + struct config_generic *gconf = gtm_opt_variables[i]; + GtmOptStack *stack; + + if (gconf->reset_source != GTMC_S_FILE || + (gconf->status & GTMOPT_IS_IN_FILE)) + continue; + if (gconf->context < GTMC_SIGHUP) + { + /* + * In the original code, errcode() stores specified error code to sqlerrcode, which does not + * exist in GTM. + */ + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server", + gconf->name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + gconf->name))); + } + continue; + } + + /* + * Reset any "file" sources to "default", else set_config_option + * will not override those settings. 
+ */ + if (gconf->reset_source == GTMC_S_FILE) + gconf->reset_source = GTMC_S_DEFAULT; + if (gconf->source == GTMC_S_FILE) + gconf->source = GTMC_S_DEFAULT; + for (stack = gconf->stack; stack; stack = stack->prev) + { + if (stack->source == GTMC_S_FILE) + stack->source = GTMC_S_DEFAULT; + } + + /* Now we can re-apply the wired-in default (i.e., the boot_val) */ + set_config_option(gconf->name, NULL, context, GTMC_S_DEFAULT, + true); + if (context == GTMC_SIGHUP) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" removed from configuration file, reset to default\n", + gconf->name); + } + else + { + ereport(elevel, + (errmsg("parameter \"%s\" removed from configuration file, reset to default", + gconf->name))); + } + } + } + + /* + * Restore any variables determined by environment variables or + * dynamically-computed defaults. This is a no-op except in the case + * where one of these had been in the config file and is now removed. + * + * In particular, we *must not* do this during the postmaster's + * initial loading of the file, since the timezone functions in + * particular should be run only after initialization is complete. + * + * XXX this is an unmaintainable crock, because we have to know how + * to set (or at least what to call to set) every variable that could + * potentially have GTMC_S_DYNAMIC_DEFAULT or GTMC_S_ENV_VAR source. + * However, there's no time to redesign it for 9.1. + */ + + /* If we got here all the options checked out okay, so apply them. 
*/ + for (item = head; item; item = item->next) + { + char *pre_value = NULL; + + if (set_config_option(item->name, item->value, context, + GTMC_S_FILE, true)) + { + set_config_sourcefile(item->name, item->filename, + item->sourceline); + + if (pre_value) + { + const char *post_value = GetConfigOption(item->name, false); + + if (!post_value) + post_value = ""; + if (strcmp(pre_value, post_value) != 0) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" changed to \"%s\"\n", + item->name, item->value); + } + else + { + ereport(elevel, + (errmsg("parameter \"%s\" changed to \"%s\"", + item->name, item->value))); + } + } + } + } + + if (pre_value) + free(pre_value); + } + + /* PGXCTODO: configuration file reload time update */ + +cleanup_list: + FreeConfigVariables(head); + if (cvc) + free(cvc); +} + +/* + * See next function for details. This one will just work with a config_file + * name rather than an already opened File Descriptor + */ +bool +ParseConfigFile(const char *config_file, const char *calling_file, + int depth, int elevel, + ConfigVariable **head_p, + ConfigVariable **tail_p) +{ + bool OK = true; + FILE *fp; + char abs_path[MAXPGPATH]; + + /* + * Reject too-deep include nesting depth. This is just a safety check + * to avoid dumping core due to stack overflow if an include file loops + * back to itself. The maximum nesting depth is pretty arbitrary. + */ + if (depth > 10) + { + if (isStartUp) + { + write_stderr("could not open configuration file \"%s\": maximum nesting depth exceeded\n", + config_file); + } + else + { + ereport(elevel, + (0, + errmsg("could not open configuration file \"%s\": maximum nesting depth exceeded", + config_file))); + } + return false; + } + + /* + * If config_file is a relative path, convert to absolute. We consider + * it to be relative to the directory holding the calling file. 
+ */ + if (!is_absolute_path(config_file)) + { + if (calling_file != NULL) + { + strlcpy(abs_path, calling_file, sizeof(abs_path)); + get_parent_directory(abs_path); + join_path_components(abs_path, abs_path, config_file); + canonicalize_path(abs_path); + config_file = abs_path; + } + else + { + /* + * calling_file is NULL, we make an absolute path from $PGDATA + */ + join_path_components(abs_path, data_directory, config_file); + canonicalize_path(abs_path); + config_file = abs_path; + } + } + + fp = fopen(config_file, "r"); + if (!fp) + { + if (isStartUp) + { + write_stderr("could not open configuration file \"%s\": %m\n", + config_file); + } + else + { + ereport(elevel, + (0, + errmsg("could not open configuration file \"%s\": %m", + config_file))); + } + return false; + } + + OK = ParseConfigFp(fp, config_file, depth, elevel, head_p, tail_p); + + fclose(fp); + + return OK; +} + +/* + * Read and parse a single configuration file. This function recurses + * to handle "include" directives. + * + * Input parameters: + * fp: file pointer from AllocateFile for the configuration file to parse + * config_file: absolute or relative path of file to read + * depth: recursion depth (used only to prevent infinite recursion) + * elevel: error logging level determined by ProcessConfigFile() + * Output parameters: + * head_p, tail_p: head and tail of linked list of name/value pairs + * + * *head_p and *tail_p must be initialized to NULL before calling the outer + * recursion level. On exit, they contain a list of name-value pairs read + * from the input file(s). + * + * Returns TRUE if successful, FALSE if an error occurred. The error has + * already been ereport'd, it is only necessary for the caller to clean up + * its own state and release the name/value pairs list. + * + * Note: if elevel >= ERROR then an error will not return control to the + * caller, and internal state such as open files will not be cleaned up. 
+ * This case occurs only during postmaster or standalone-backend startup, + * where an error will lead to immediate process exit anyway; so there is + * no point in contorting the code so it can clean up nicely. + */ +bool +ParseConfigFp(FILE *fp, const char *config_file, int depth, int elevel, + ConfigVariable **head_p, ConfigVariable **tail_p) +{ + bool OK = true; + YY_BUFFER_STATE lex_buffer; + int token; + + /* + * Parse + */ + lex_buffer = GTMOPT_yy_create_buffer(fp, YY_BUF_SIZE); + GTMOPT_yy_switch_to_buffer(lex_buffer); + + ConfigFileLineno = 1; + + /* This loop iterates once per logical line */ + while ((token = GTMOPT_yylex())) + { + char *opt_name, *opt_value; + ConfigVariable *item; + + if (token == GTMOPT_EOL) /* empty or comment line */ + continue; + + /* first token on line is option name */ + if (token != GTMOPT_ID && token != GTMOPT_QUALIFIED_ID) + goto parse_error; + opt_name = strdup(GTMOPT_yytext); + + /* next we have an optional equal sign; discard if present */ + token = GTMOPT_yylex(); + if (token == GTMOPT_EQUALS) + token = GTMOPT_yylex(); + + /* now we must have the option value */ + if (token != GTMOPT_ID && + token != GTMOPT_STRING && + token != GTMOPT_INTEGER && + token != GTMOPT_REAL && + token != GTMOPT_UNQUOTED_STRING) + goto parse_error; + if (token == GTMOPT_STRING) /* strip quotes and escapes */ + opt_value = GTMOPT_scanstr(GTMOPT_yytext); + else + opt_value = strdup(GTMOPT_yytext); + + /* now we'd like an end of line, or possibly EOF */ + token = GTMOPT_yylex(); + if (token != GTMOPT_EOL) + { + if (token != 0) + goto parse_error; + /* treat EOF like \n for line numbering purposes, cf bug 4752 */ + ConfigFileLineno++; + } + + /* OK, process the option name and value */ + if (gtm_opt_name_compare(opt_name, "include") == 0) + { + /* + * An include directive isn't a variable and should be processed + * immediately. 
+ */ + unsigned int save_ConfigFileLineno = ConfigFileLineno; + + if (!ParseConfigFile(opt_value, config_file, + depth + 1, elevel, + head_p, tail_p)) + { + free(opt_name); + free(opt_value); + OK = false; + goto cleanup_exit; + } + GTMOPT_yy_switch_to_buffer(lex_buffer); + ConfigFileLineno = save_ConfigFileLineno; + free(opt_name); + free(opt_value); + } + else if (gtm_opt_name_compare(opt_name, "custom_variable_classes") == 0) + { + /* + * This variable must be processed first as it controls + * the validity of other variables; so it goes at the head + * of the result list. If we already found a value for it, + * replace with this one. + */ + item = *head_p; + if (item != NULL && + gtm_opt_name_compare(item->name, "custom_variable_classes") == 0) + { + /* replace existing head item */ + free(item->name); + free(item->value); + item->name = opt_name; + item->value = opt_value; + item->filename = strdup(config_file); + item->sourceline = ConfigFileLineno-1; + } + else + { + /* prepend to list */ + item = malloc(sizeof *item); + item->name = opt_name; + item->value = opt_value; + item->filename = strdup(config_file); + item->sourceline = ConfigFileLineno-1; + item->next = *head_p; + *head_p = item; + if (*tail_p == NULL) + *tail_p = item; + } + } + else + { + /* ordinary variable, append to list */ + item = malloc(sizeof *item); + item->name = opt_name; + item->value = opt_value; + item->filename = strdup(config_file); + item->sourceline = ConfigFileLineno-1; + item->next = NULL; + if (*head_p == NULL) + *head_p = item; + else + (*tail_p)->next = item; + *tail_p = item; + } + + /* break out of loop if read EOF, else loop for next line */ + if (token == 0) + break; + } + + /* successful completion of parsing */ + goto cleanup_exit; + + parse_error: + if (token == GTMOPT_EOL || token == 0) + { + if (isStartUp) + { + write_stderr("syntax error in file \"%s\" line %u, near end of line\n", + config_file, ConfigFileLineno - 1); + } + else + { + ereport(elevel, + (0, + 
errmsg("syntax error in file \"%s\" line %u, near end of line", + config_file, ConfigFileLineno - 1))); + } + } + else + { + if (isStartUp) + { + write_stderr("syntax error in file \"%s\" line %u, near token \"%s\"\n", + config_file, ConfigFileLineno, GTMOPT_yytext); + } + else + { + ereport(elevel, + (0, + errmsg("syntax error in file \"%s\" line %u, near token \"%s\"", + config_file, ConfigFileLineno, GTMOPT_yytext))); + } + } + OK = false; + +cleanup_exit: + GTMOPT_yy_delete_buffer(lex_buffer); + return OK; +} + + +/* + * Free a list of ConfigVariables, including the names and the values + */ +void +FreeConfigVariables(ConfigVariable *list) +{ + ConfigVariable *item; + + item = list; + while (item) + { + ConfigVariable *next = item->next; + + free(item->name); + free(item->value); + free(item->filename); + free(item); + item = next; + } +} + + +/* + * scanstr + * + * Strip the quotes surrounding the given string, and collapse any embedded + * '' sequences and backslash escapes. + * + * the string returned is malloc'd and should eventually be free'd by the + * caller. 
+ */ +static char * +GTMOPT_scanstr(const char *s) +{ + char *newStr; + int len, + i, + j; + + Assert(s != NULL && s[0] == '\''); + len = strlen(s); + Assert(len >= 2); + Assert(s[len-1] == '\''); + + /* Skip the leading quote; we'll handle the trailing quote below */ + s++, len--; + + /* Since len still includes trailing quote, this is enough space */ + newStr = malloc(len); + + for (i = 0, j = 0; i < len; i++) + { + if (s[i] == '\\') + { + i++; + switch (s[i]) + { + case 'b': + newStr[j] = '\b'; + break; + case 'f': + newStr[j] = '\f'; + break; + case 'n': + newStr[j] = '\n'; + break; + case 'r': + newStr[j] = '\r'; + break; + case 't': + newStr[j] = '\t'; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + { + int k; + long octVal = 0; + + for (k = 0; + s[i + k] >= '0' && s[i + k] <= '7' && k < 3; + k++) + octVal = (octVal << 3) + (s[i + k] - '0'); + i += k - 1; + newStr[j] = ((char) octVal); + } + break; + default: + newStr[j] = s[i]; + break; + } /* switch */ + } + else if (s[i] == '\'' && s[i+1] == '\'') + { + /* doubled quote becomes just one quote */ + newStr[j] = s[++i]; + } + else + newStr[j] = s[i]; + j++; + } + + /* We copied the ending quote to newStr, so replace with \0 */ + Assert(j > 0 && j <= len); + newStr[--j] = '\0'; + + return newStr; +} + +/* + * The following code includes most of the code ported from guc.c. + * Because they should be shared by gtm_opt.c and gtm_proxy_opt.c, they are placed here. 
+ */ + +/* + * Some infrastructure for checking malloc/strdup/realloc calls + */ +static void * +gtm_opt_malloc(int elevel, size_t size) +{ + void *data; + + data = malloc(size); + if (data == NULL) + { + if (isStartUp) + { + write_stderr("out of memory\n"); + } + else + { + ereport(elevel, + (0, + errmsg("out of memory"))); + } + } + return data; +} + +#if 0 +/* PGXCTODO: this will be used for future extensions */ +static void * +gtm_opt_realloc(int elevel, void *old, size_t size) +{ + void *data; + + data = realloc(old, size); + if (data == NULL) + { + if (isStartUp) + { + write_stderr("out of memory\n"); + } + else + { + ereport(elevel, + (0, + errmsg("out of memory"))); + } + } + return data; +} +#endif + +static char * +gtm_opt_strdup(int elevel, const char *src) +{ + char *data; + + data = strdup(src); + if (data == NULL) + { + if (isStartUp) + { + write_stderr("out of memory\n"); + } + else + { + ereport(elevel, + (0, + errmsg("out of memory"))); + } + } + return data; +} + +/* + * Detect whether strval is referenced anywhere in a GTM string item + */ +static bool +string_field_used(struct config_string * conf, char *strval) +{ + GtmOptStack *stack; + + if (strval == *(conf->variable) || + strval == conf->reset_val || + strval == conf->boot_val) + return true; + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (strval == stack->prior.val.stringval || + strval == stack->masked.val.stringval) + return true; + } + return false; +} + + +/* + * Support for assigning to a field of a string GTM item. Free the prior + * value if it's not referenced anywhere else in the item (including stacked + * states). 
+ */ +static void +set_string_field(struct config_string * conf, char **field, char *newval) +{ + char *oldval = *field; + + /* Do the assignment */ + *field = newval; + + /* Free old value if it's not NULL and isn't referenced anymore */ + if (oldval && !string_field_used(conf, oldval)) + free(oldval); +} + + +/* + * Detect whether an "extra" struct is referenced anywhere in a GTM item + */ +static bool +extra_field_used(struct config_generic * gconf, void *extra) +{ + GtmOptStack *stack; + + if (extra == gconf->extra) + return true; + switch (gconf->vartype) + { + case GTMC_BOOL: + if (extra == ((struct config_bool *) gconf)->reset_extra) + return true; + break; + case GTMC_INT: + if (extra == ((struct config_int *) gconf)->reset_extra) + return true; + break; + case GTMC_REAL: + if (extra == ((struct config_real *) gconf)->reset_extra) + return true; + break; + case GTMC_STRING: + if (extra == ((struct config_string *) gconf)->reset_extra) + return true; + break; + case GTMC_ENUM: + if (extra == ((struct config_enum *) gconf)->reset_extra) + return true; + break; + } + for (stack = gconf->stack; stack; stack = stack->prev) + { + if (extra == stack->prior.extra || + extra == stack->masked.extra) + return true; + } + + return false; +} + + +/* + * Support for assigning to an "extra" field of a GTM item. Free the prior + * value if it's not referenced anywhere else in the item (including stacked + * states). + */ +static void +set_extra_field(struct config_generic * gconf, void **field, void *newval) +{ + void *oldval = *field; + + /* Do the assignment */ + *field = newval; + + /* Free old value if it's not NULL and isn't referenced anymore */ + if (oldval && !extra_field_used(gconf, oldval)) + free(oldval); +} + + +/* + * Support for copying a variable's active value into a stack entry. + * The "extra" field associated with the active value is copied, too. 
+ * + * NB: be sure stringval and extra fields of a new stack entry are + * initialized to NULL before this is used, else we'll try to free() them. + */ +static void +set_stack_value(struct config_generic * gconf, config_var_value *val) +{ + switch (gconf->vartype) + { + case GTMC_BOOL: + val->val.boolval = + *((struct config_bool *) gconf)->variable; + break; + case GTMC_INT: + val->val.intval = + *((struct config_int *) gconf)->variable; + break; + case GTMC_REAL: + val->val.realval = + *((struct config_real *) gconf)->variable; + break; + case GTMC_STRING: + set_string_field((struct config_string *) gconf, + &(val->val.stringval), + *((struct config_string *) gconf)->variable); + break; + case GTMC_ENUM: + val->val.enumval = + *((struct config_enum *) gconf)->variable; + break; + } + set_extra_field(gconf, &(val->extra), gconf->extra); +} + +#if 0 +/* PGXCTODO: This is let for future extension support */ +/* + * Support for discarding a no-longer-needed value in a stack entry. + * The "extra" field associated with the stack entry is cleared, too. + */ +static void +discard_stack_value(struct config_generic * gconf, config_var_value *val) +{ + switch (gconf->vartype) + { + case GTMC_BOOL: + case GTMC_INT: + case GTMC_REAL: + case GTMC_ENUM: + /* no need to do anything */ + break; + case GTMC_STRING: + set_string_field((struct config_string *) gconf, + &(val->val.stringval), + NULL); + break; + } + set_extra_field(gconf, &(val->extra), NULL); +} +#endif + +/* + * Fetch the sorted array pointer (exported for help_config.c's use ONLY) + */ +struct config_generic ** +get_gtm_opt_variables(void) +{ + return gtm_opt_variables; +} + +/* + * Build the sorted array. This is split out so that it could be + * re-executed after startup (eg, we could allow loadable modules to + * add vars, and then we'd need to re-sort). 
+ */ +void +build_gtm_opt_variables(void) +{ + int size_vars; + int num_vars = 0; + struct config_generic **gtm_opt_vars; + int i; + + for (i = 0; ConfigureNamesBool[i].gen.name; i++) + { + struct config_bool *conf = &ConfigureNamesBool[i]; + + /* Rather than requiring vartype to be filled in by hand, do this: */ + conf->gen.vartype = GTMC_BOOL; + num_vars++; + } + + for (i = 0; ConfigureNamesInt[i].gen.name; i++) + { + struct config_int *conf = &ConfigureNamesInt[i]; + + conf->gen.vartype = GTMC_INT; + num_vars++; + } + + for (i = 0; ConfigureNamesReal[i].gen.name; i++) + { + struct config_real *conf = &ConfigureNamesReal[i]; + + conf->gen.vartype = GTMC_REAL; + num_vars++; + } + + for (i = 0; ConfigureNamesString[i].gen.name; i++) + { + struct config_string *conf = &ConfigureNamesString[i]; + + conf->gen.vartype = GTMC_STRING; + num_vars++; + } + + for (i = 0; ConfigureNamesEnum[i].gen.name; i++) + { + struct config_enum *conf = &ConfigureNamesEnum[i]; + + conf->gen.vartype = GTMC_ENUM; + num_vars++; + } + + /* + * Create table with 20% slack + */ + size_vars = num_vars + num_vars / 4; + + gtm_opt_vars = (struct config_generic **) + gtm_opt_malloc(FATAL, size_vars * sizeof(struct config_generic *)); + + num_vars = 0; + + for (i = 0; ConfigureNamesBool[i].gen.name; i++) + gtm_opt_vars[num_vars++] = &ConfigureNamesBool[i].gen; + + for (i = 0; ConfigureNamesInt[i].gen.name; i++) + gtm_opt_vars[num_vars++] = &ConfigureNamesInt[i].gen; + + for (i = 0; ConfigureNamesReal[i].gen.name; i++) + gtm_opt_vars[num_vars++] = &ConfigureNamesReal[i].gen; + + for (i = 0; ConfigureNamesString[i].gen.name; i++) + gtm_opt_vars[num_vars++] = &ConfigureNamesString[i].gen; + + for (i = 0; ConfigureNamesEnum[i].gen.name; i++) + gtm_opt_vars[num_vars++] = &ConfigureNamesEnum[i].gen; + + if (gtm_opt_variables) + free(gtm_opt_variables); + gtm_opt_variables = gtm_opt_vars; + num_gtm_opt_variables = num_vars; + size_gtm_opt_variables = size_vars; + qsort((void *) gtm_opt_variables, 
num_gtm_opt_variables, + sizeof(struct config_generic *), gtm_opt_var_compare); +} + + +#if 0 +/* PGXCTODO: This is let for future extension support */ +/* + * Add a new GTM variable to the list of known variables. The + * list is expanded if needed. + */ +static bool +add_gtm_opt_variable(struct config_generic * var, int elevel) +{ + if (num_gtm_opt_variables + 1 >= size_gtm_opt_variables) + { + /* + * Increase the vector by 25% + */ + int size_vars = size_gtm_opt_variables + size_gtm_opt_variables / 4; + struct config_generic **gtm_opt_vars; + + if (size_vars == 0) + { + size_vars = 100; + gtm_opt_vars = (struct config_generic **) + gtm_opt_malloc(elevel, size_vars * sizeof(struct config_generic *)); + } + else + { + gtm_opt_vars = (struct config_generic **) + gtm_opt_realloc(elevel, gtm_opt_variables, size_vars * sizeof(struct config_generic *)); + } + + if (gtm_opt_vars == NULL) + return false; /* out of memory */ + + gtm_opt_variables = gtm_opt_vars; + size_gtm_opt_variables = size_vars; + } + gtm_opt_variables[num_gtm_opt_variables++] = var; + qsort((void *) gtm_opt_variables, num_gtm_opt_variables, + sizeof(struct config_generic *), gtm_opt_var_compare); + return true; +} + + +/* + * Create and add a placeholder variable. It's presumed to belong + * to a valid custom variable class at this point. 
+ */ +static struct config_generic * +add_placeholder_variable(const char *name, int elevel) +{ + size_t sz = sizeof(struct config_string) + sizeof(char *); + struct config_string *var; + struct config_generic *gen; + + var = (struct config_string *) gtm_opt_malloc(elevel, sz); + if (var == NULL) + return NULL; + memset(var, 0, sz); + gen = &var->gen; + + gen->name = gtm_opt_strdup(elevel, name); + if (gen->name == NULL) + { + free(var); + return NULL; + } + + gen->context = GTMC_USERSET; + gen->short_desc = "GTM placeholder variable"; + gen->flags = GTMOPT_NO_SHOW_ALL | GTMOPT_NOT_IN_SAMPLE | GTMOPT_CUSTOM_PLACEHOLDER; + gen->vartype = GTMC_STRING; + + /* + * The char* is allocated at the end of the struct since we have no + * 'static' place to point to. Note that the current value, as well as + * the boot and reset values, start out NULL. + */ + var->variable = (char **) (var + 1); + + if (!add_gtm_opt_variable((struct config_generic *) var, elevel)) + { + free((void *) gen->name); + free(var); + return NULL; + } + + return gen; +} +#endif + +/* + * Look up option NAME. If it exists, return a pointer to its record, + * else return NULL. If create_placeholders is TRUE, we'll create a + * placeholder record for a valid-looking custom variable name. + */ +static struct config_generic * +find_option(const char *name, bool create_placeholders, int elevel) +{ + const char **key = &name; + struct config_generic **res; + + Assert(name); + + /* + * By equating const char ** with struct config_generic *, we are assuming + * the name field is first in config_generic. 
+ */ + res = (struct config_generic **) bsearch((void *) &key, + (void *) gtm_opt_variables, + num_gtm_opt_variables, + sizeof(struct config_generic *), + gtm_opt_var_compare); + if (res) + return *res; + + /* Unknown name */ + return NULL; +} + + +/* + * comparator for qsorting and bsearching gtm_opt_variables array + */ +static int +gtm_opt_var_compare(const void *a, const void *b) +{ + struct config_generic *confa = *(struct config_generic **) a; + struct config_generic *confb = *(struct config_generic **) b; + + return gtm_opt_name_compare(confa->name, confb->name); +} + + +/* + * the bare comparison function for GTM names + */ +static int +gtm_opt_name_compare(const char *namea, const char *nameb) +{ + /* + * The temptation to use strcasecmp() here must be resisted, because the + * array ordering has to remain stable across setlocale() calls. So, build + * our own with a simple ASCII-only downcasing. + */ + while (*namea && *nameb) + { + char cha = *namea++; + char chb = *nameb++; + + if (cha >= 'A' && cha <= 'Z') + cha += 'a' - 'A'; + if (chb >= 'A' && chb <= 'Z') + chb += 'a' - 'A'; + if (cha != chb) + return cha - chb; + } + if (*namea) + return 1; /* a is longer */ + if (*nameb) + return -1; /* b is longer */ + return 0; +} + + +/* + * Initialize GTM options during program startup. + * + * Note that we cannot read the config file yet, since we have not yet + * processed command-line switches. + */ +void +InitializeGTMOptions(void) +{ + int i; + + /* + * Build sorted array of all GTM variables. + */ + build_gtm_opt_variables(); + + /* + * Load all variables with their compiled-in defaults, and initialize + * status fields as needed. + */ + for (i = 0; i < num_gtm_opt_variables; i++) + { + InitializeOneGTMOption(gtm_opt_variables[i]); + } + + reporting_enabled = false; + +} + + +/* + * Initialize one GTM option variable to its compiled-in default. 
+ * + * Note: the reason for calling check_hooks is not that we think the boot_val + * might fail, but that the hooks might wish to compute an "extra" struct. + */ +static void +InitializeOneGTMOption(struct config_generic * gconf) +{ + gconf->status = 0; + gconf->reset_source = GTMC_S_DEFAULT; + gconf->source = GTMC_S_DEFAULT; + gconf->stack = NULL; + gconf->extra = NULL; + gconf->sourcefile = NULL; + gconf->sourceline = 0; + gconf->context = GTMC_DEFAULT; + + switch (gconf->vartype) + { + case GTMC_BOOL: + { + struct config_bool *conf = (struct config_bool *) gconf; + bool newval = conf->boot_val; + void *extra = NULL; + + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + break; + } + case GTMC_INT: + { + struct config_int *conf = (struct config_int *) gconf; + int newval = conf->boot_val; + void *extra = NULL; + + Assert(newval >= conf->min); + Assert(newval <= conf->max); + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + break; + } + case GTMC_REAL: + { + struct config_real *conf = (struct config_real *) gconf; + double newval = conf->boot_val; + void *extra = NULL; + + Assert(newval >= conf->min); + Assert(newval <= conf->max); + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + break; + } + case GTMC_STRING: + { + struct config_string *conf = (struct config_string *) gconf; + char *newval; + void *extra = NULL; + + /* non-NULL boot_val must always get strdup'd */ + if (conf->boot_val != NULL) + newval = gtm_opt_strdup(FATAL, conf->boot_val); + else + newval = NULL; + + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + break; + } + case GTMC_ENUM: + { + struct config_enum *conf = (struct config_enum *) gconf; + int newval = conf->boot_val; + void *extra = NULL; + + *conf->variable = conf->reset_val = newval; + conf->gen.extra = conf->reset_extra = extra; + break; + } + } +} + + +/* + * 
Select the configuration files and data directory to be used, and + * do the initial read of postgresql.conf. + * + * This is called after processing command-line switches. + * userDoption is the -D switch value if any (NULL if unspecified). + * progname is just for use in error messages. + * + * Returns true on success; on failure, prints a suitable error message + * to stderr and returns false. + */ +bool +SelectConfigFiles(const char *userDoption, const char *progname) +{ + char *configdir; + char *fname; + struct stat stat_buf; + + /* configdir is -D option, or $PGDATA if no -D */ + if (userDoption) + configdir = make_absolute_path(userDoption); + else + configdir = NULL; + + /* + * Find the configuration file: if config_file was specified on the + * command line, use it, else use configdir/postgresql.conf. In any case + * ensure the result is an absolute path, so that it will be interpreted + * the same way by future backends. + */ + if (GTMConfigFileName) + { + if (GTMConfigFileName[0] == '/') + fname = make_absolute_path(GTMConfigFileName); + else + { + if (configdir) + { + fname = gtm_opt_malloc(FATAL, + strlen(configdir) + strlen(GTMConfigFileName) + 2); + sprintf(fname, "%s/%s", configdir, GTMConfigFileName); + } + else + fname = make_absolute_path(GTMConfigFileName); + } + } + else if (configdir) + { + fname = gtm_opt_malloc(FATAL, + strlen(configdir) + strlen(config_filename) + 2); + sprintf(fname, "%s/%s", configdir, config_filename); + } + else + { + write_stderr("%s does not know where to find the server configuration file.\n" + "You must specify the --config-file or -D invocation " + "option or set the PGDATA environment variable.\n", + progname); + return false; + } + + /* + * Set the GTMConfigFileName GTM variable to its final value, ensuring that + * it can't be overridden later. + */ + SetConfigOption("config_file", fname, GTMC_STARTUP, GTMC_S_OVERRIDE); + free(fname); + + /* + * Now read the config file for the first time. 
+ */ + if (stat(GTMConfigFileName, &stat_buf) != 0) + { + write_stderr("%s cannot access the server configuration file \"%s\": %s\n", + progname, GTMConfigFileName, strerror(errno)); + return false; + } + + ProcessConfigFile(GTMC_STARTUP); + + free(configdir); + + return true; +} + +/* + * Reset all options to their saved default values (implements RESET ALL) + */ +void +ResetAllOptions(void) +{ + int i; + + for (i = 0; i < num_gtm_opt_variables; i++) + { + struct config_generic *gconf = gtm_opt_variables[i]; + + /* Don't reset if special exclusion from RESET ALL */ + if (gconf->flags & GTMOPT_NO_RESET_ALL) + continue; + /* No need to reset if wasn't SET */ + if (gconf->source <= GTMC_S_OVERRIDE) + continue; + + switch (gconf->vartype) + { + case GTMC_BOOL: + { + struct config_bool *conf = (struct config_bool *) gconf; + + *conf->variable = conf->reset_val; + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + break; + } + case GTMC_INT: + { + struct config_int *conf = (struct config_int *) gconf; + + *conf->variable = conf->reset_val; + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + break; + } + case GTMC_REAL: + { + struct config_real *conf = (struct config_real *) gconf; + + *conf->variable = conf->reset_val; + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + break; + } + case GTMC_STRING: + { + struct config_string *conf = (struct config_string *) gconf; + + set_string_field(conf, conf->variable, conf->reset_val); + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + break; + } + case GTMC_ENUM: + { + struct config_enum *conf = (struct config_enum *) gconf; + + *conf->variable = conf->reset_val; + set_extra_field(&conf->gen, &conf->gen.extra, + conf->reset_extra); + break; + } + } + + gconf->source = gconf->reset_source; + + if (gconf->flags & GTMOPT_REPORT) + ReportGTMOption(gconf); + } +} + + + +/* + * push_old_value + * Push previous state during transactional assignment to a GTM 
variable. + */ +static void +push_old_value(struct config_generic * gconf) +{ + GtmOptStack *stack; + + /* If we're not inside a nest level, do nothing */ + if (GTMOptUpdateCount == 0) + return; + + /* Do we already have a stack entry of the current nest level? */ + stack = gconf->stack; + if (stack && stack->nest_level >= GTMOptUpdateCount) + return; + + /* + * Push a new stack entry + * + * We keep all the stack entries in TopTransactionContext for simplicity. + */ + stack = (GtmOptStack *) MemoryContextAllocZero(TopMemoryContext, + sizeof(GtmOptStack)); + + stack->prev = gconf->stack; + stack->nest_level = GTMOptUpdateCount; + stack->source = gconf->source; + set_stack_value(gconf, &stack->prior); + + gconf->stack = stack; +} + + + +/* + * Enter a new nesting level for GTM values. This is called at subtransaction + * start and when entering a function that has proconfig settings. NOTE that + * we must not risk error here, else subtransaction start will be unhappy. + */ +int +NewGTMNestLevel(void) +{ + return ++GTMOptUpdateCount; +} + +/* + * Try to parse value as an integer. The accepted formats are the + * usual decimal, octal, or hexadecimal formats, optionally followed by + * a unit name if "flags" indicates a unit is allowed. + * + * If the string parses okay, return true, else false. + * If okay and result is not NULL, return the value in *result. + * If not okay and hintmsg is not NULL, *hintmsg is set to a suitable + * HINT message, or NULL if no hint provided. 
+ */ +bool +parse_int(const char *value, int *result, int flags, const char **hintmsg) +{ + int64 val; + char *endptr; + + /* To suppress compiler warnings, always set output params */ + if (result) + *result = 0; + if (hintmsg) + *hintmsg = NULL; + + /* We assume here that int64 is at least as wide as long */ + errno = 0; + val = strtol(value, &endptr, 0); + + if (endptr == value) + return false; /* no HINT for integer syntax error */ + + if (errno == ERANGE || val != (int64) ((int32) val)) + { + if (hintmsg) + *hintmsg = gettext_noop("Value exceeds integer range."); + return false; + } + + /* allow whitespace between integer and unit */ + while (isspace((unsigned char) *endptr)) + endptr++; + + /* Handle possible unit */ + if (*endptr != '\0') + { + /* + * Note: the multiple-switch coding technique here is a bit tedious, + * but seems necessary to avoid intermediate-value overflows. + */ + if (flags & GTMOPT_UNIT_MEMORY) + { + /* Set hint for use if no match or trailing garbage */ + if (hintmsg) + *hintmsg = gettext_noop("Valid units for this parameter are \"kB\", \"MB\", and \"GB\"."); + +#if BLCKSZ < 1024 || BLCKSZ > (1024*1024) +#error BLCKSZ must be between 1KB and 1MB +#endif +#if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024) +#error XLOG_BLCKSZ must be between 1KB and 1MB +#endif + + if (strncmp(endptr, "kB", 2) == 0) + { + endptr += 2; + switch (flags & GTMOPT_UNIT_MEMORY) + { + case GTMOPT_UNIT_BLOCKS: + val /= (BLCKSZ / 1024); + break; + case GTMOPT_UNIT_XBLOCKS: + val /= (XLOG_BLCKSZ / 1024); + break; + } + } + else if (strncmp(endptr, "MB", 2) == 0) + { + endptr += 2; + switch (flags & GTMOPT_UNIT_MEMORY) + { + case GTMOPT_UNIT_KB: + val *= KB_PER_MB; + break; + case GTMOPT_UNIT_BLOCKS: + val *= KB_PER_MB / (BLCKSZ / 1024); + break; + case GTMOPT_UNIT_XBLOCKS: + val *= KB_PER_MB / (XLOG_BLCKSZ / 1024); + break; + } + } + else if (strncmp(endptr, "GB", 2) == 0) + { + endptr += 2; + switch (flags & GTMOPT_UNIT_MEMORY) + { + case GTMOPT_UNIT_KB: + val 
*= KB_PER_GB; + break; + case GTMOPT_UNIT_BLOCKS: + val *= KB_PER_GB / (BLCKSZ / 1024); + break; + case GTMOPT_UNIT_XBLOCKS: + val *= KB_PER_GB / (XLOG_BLCKSZ / 1024); + break; + } + } + } + else if (flags & GTMOPT_UNIT_TIME) + { + /* Set hint for use if no match or trailing garbage */ + if (hintmsg) + *hintmsg = gettext_noop("Valid units for this parameter are \"ms\", \"s\", \"min\", \"h\", and \"d\"."); + + if (strncmp(endptr, "ms", 2) == 0) + { + endptr += 2; + switch (flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_S: + val /= MS_PER_S; + break; + case GTMOPT_UNIT_MIN: + val /= MS_PER_MIN; + break; + } + } + else if (strncmp(endptr, "s", 1) == 0) + { + endptr += 1; + switch (flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_MS: + val *= MS_PER_S; + break; + case GTMOPT_UNIT_MIN: + val /= S_PER_MIN; + break; + } + } + else if (strncmp(endptr, "min", 3) == 0) + { + endptr += 3; + switch (flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_MS: + val *= MS_PER_MIN; + break; + case GTMOPT_UNIT_S: + val *= S_PER_MIN; + break; + } + } + else if (strncmp(endptr, "h", 1) == 0) + { + endptr += 1; + switch (flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_MS: + val *= MS_PER_H; + break; + case GTMOPT_UNIT_S: + val *= S_PER_H; + break; + case GTMOPT_UNIT_MIN: + val *= MIN_PER_H; + break; + } + } + else if (strncmp(endptr, "d", 1) == 0) + { + endptr += 1; + switch (flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_MS: + val *= MS_PER_D; + break; + case GTMOPT_UNIT_S: + val *= S_PER_D; + break; + case GTMOPT_UNIT_MIN: + val *= MIN_PER_D; + break; + } + } + } + + /* allow whitespace after unit */ + while (isspace((unsigned char) *endptr)) + endptr++; + + if (*endptr != '\0') + return false; /* appropriate hint, if any, already set */ + + /* Check for overflow due to units conversion */ + if (val != (int64) ((int32) val)) + { + if (hintmsg) + *hintmsg = gettext_noop("Value exceeds integer range."); + return false; + } + } + + if (result) + *result = (int) val; + return true; +} + 
+ + +/* + * Try to parse value as a floating point number in the usual format. + * If the string parses okay, return true, else false. + * If okay and result is not NULL, return the value in *result. + */ +bool +parse_real(const char *value, double *result) +{ + double val; + char *endptr; + + if (result) + *result = 0; /* suppress compiler warning */ + + errno = 0; + val = strtod(value, &endptr); + if (endptr == value || errno == ERANGE) + return false; + + /* allow whitespace after number */ + while (isspace((unsigned char) *endptr)) + endptr++; + if (*endptr != '\0') + return false; + + if (result) + *result = val; + return true; +} + + + +/* + * Lookup the value for an enum option with the selected name + * (case-insensitive). + * If the enum option is found, sets the retval value and returns + * true. If it's not found, return FALSE and retval is set to 0. + */ +bool +config_enum_lookup_by_name(struct config_enum * record, const char *value, + int *retval) +{ + const struct config_enum_entry *entry; + + for (entry = record->options; entry && entry->name; entry++) + { + if (pg_strcasecmp(value, entry->name) == 0) + { + *retval = entry->val; + return TRUE; + } + } + + *retval = 0; + return FALSE; +} + + + +/* + * Return a list of all available options for an enum, excluding + * hidden ones, separated by the given separator. + * If prefix is non-NULL, it is added before the first enum value. + * If suffix is non-NULL, it is added to the end of the string. 
+ */ +static char * +config_enum_get_options(struct config_enum * record, const char *prefix, + const char *suffix, const char *separator) +{ + const struct config_enum_entry *entry; + StringInfoData retstr; + int seplen; + + initStringInfo(&retstr); + appendStringInfoString(&retstr, prefix); + + seplen = strlen(separator); + for (entry = record->options; entry && entry->name; entry++) + { + if (!entry->hidden) + { + appendStringInfoString(&retstr, entry->name); + appendBinaryStringInfo(&retstr, separator, seplen); + } + } + + /* + * All the entries may have been hidden, leaving the string empty if no + * prefix was given. This indicates a broken GTM setup, since there is no + * use for an enum without any values, so we just check to make sure we + * don't write to invalid memory instead of actually trying to do + * something smart with it. + */ + if (retstr.len >= seplen) + { + /* Replace final separator */ + retstr.data[retstr.len - seplen] = '\0'; + retstr.len -= seplen; + } + + appendStringInfoString(&retstr, suffix); + + return retstr.data; +} + + +/* + * Sets option `name' to given value. The value should be a string + * which is going to be parsed and converted to the appropriate data + * type. The context and source parameters indicate in which context this + * function is being called so it can apply the access restrictions + * properly. + * + * If value is NULL, set the option to its default value (normally the + * reset_val, but if source == GTMC_S_DEFAULT we instead use the boot_val). + * + * action indicates whether to set the value globally in the session, locally + * to the current top transaction, or just for the duration of a function call. + * + * If changeVal is false then don't really set the option but do all + * the checks to see if it would work. 
+ * + * If there is an error (non-existing option, invalid value) then an + * ereport(ERROR) is thrown *unless* this is called in a context where we + * don't want to ereport (currently, startup or SIGHUP config file reread). + * In that case we write a suitable error message via ereport(LOG) and + * return false. This is working around the deficiencies in the ereport + * mechanism, so don't blame me. In all other cases, the function + * returns true, including cases where the input is valid but we chose + * not to apply it because of context or source-priority considerations. + * + * See also SetConfigOption for an external interface. + */ +bool +set_config_option(const char *name, const char *value, + GtmOptContext context, GtmOptSource source, + bool changeVal) +{ + struct config_generic *record; + int elevel; + bool prohibitValueChange = false; + bool makeDefault; + + if (context == GTMC_SIGHUP || source == GTMC_S_DEFAULT) + { + /* + * To avoid cluttering the log, only the postmaster bleats loudly + * about problems with the config file. + */ + elevel = DEBUG3; + } + else if (source == GTMC_S_DATABASE || source == GTMC_S_USER || + source == GTMC_S_DATABASE_USER) + elevel = WARNING; + else + elevel = ERROR; + + record = find_option(name, true, elevel); + if (record == NULL) + { + if (isStartUp) + { + write_stderr("unrecognized configuration parameter \"%s\"\n", name); + } + else + { + ereport(elevel, + (0, + errmsg("unrecognized configuration parameter \"%s\"", name))); + } + return false; + } + + /* + * If source is postgresql.conf, mark the found record with + * GTMOPT_IS_IN_FILE. This is for the convenience of ProcessConfigFile. Note + * that we do it even if changeVal is false, since ProcessConfigFile wants + * the marking to occur during its testing pass. + */ + if (source == GTMC_S_FILE) + record->status |= GTMOPT_IS_IN_FILE; + + /* + * Check if the option can be set at this time. See guc.h for the precise + * rules. 
+ */ + switch (record->context) + { + case GTMC_DEFAULT: + case GTMC_STARTUP: + if (context == GTMC_SIGHUP) + { + /* + * We are re-reading a GTMC_POSTMASTER variable from + * postgresql.conf. We can't change the setting, so we should + * give a warning if the DBA tries to change it. However, + * because of variant formats, canonicalization by check + * hooks, etc, we can't just compare the given string directly + * to what's stored. Set a flag to check below after we have + * the final storable value. + * + * During the "checking" pass we just do nothing, to avoid + * printing the warning twice. + */ + if (!changeVal) + return true; + + prohibitValueChange = true; + } + else if (context != GTMC_STARTUP) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + return false; + } + break; + case GTMC_SIGHUP: + if (context != GTMC_SIGHUP && context != GTMC_STARTUP) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed now\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed now", + name))); + } + return false; + } + + /* + * Hmm, the idea of the SIGHUP context is "ought to be global, but + * can be changed after postmaster start". But there's nothing + * that prevents a crafty administrator from sending SIGHUP + * signals to individual backends only. + */ + break; + default: + if (isStartUp) + { + write_stderr("GtmOptContext invalid (%d)\n", + context); + } + else + { + ereport(elevel, + (0, + errmsg("GtmOptContext invalid (%d)", + context))); + } + return false; + } + + /* + * Should we set reset/stacked values? (If so, the behavior is not + * transactional.) This is done either when we get a default value from + * the database's/user's/client's default settings or when we reset a + * value to its default. 
+ */ + makeDefault = changeVal && (source <= GTMC_S_OVERRIDE) && + ((value != NULL) || source == GTMC_S_DEFAULT); + + /* + * Ignore attempted set if overridden by previously processed setting. + * However, if changeVal is false then plow ahead anyway since we are + * trying to find out if the value is potentially good, not actually use + * it. Also keep going if makeDefault is true, since we may want to set + * the reset/stacked values even if we can't set the variable itself. + */ + if (record->source > source) + { + if (changeVal && !makeDefault) + { + if (isStartUp) + { + write_stderr("\"%s\": setting ignored because previous source is higher priority\n", + name); + } + else + { + elog(DEBUG3, "\"%s\": setting ignored because previous source is higher priority", + name); + } + return true; + } + changeVal = false; + } + + /* + * Evaluate value and set variable. + */ + switch (record->vartype) + { + case GTMC_BOOL: + { + struct config_bool *conf = (struct config_bool *) record; + bool newval; + void *newextra = NULL; + + if (value) + { + if (!gtm_opt_parse_bool(value, &newval)) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" requires a Boolean value\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" requires a Boolean value", + name))); + } + return false; + } + } + else if (source == GTMC_S_DEFAULT) + { + newval = conf->boot_val; + } + else + { + newval = conf->reset_val; + newextra = conf->reset_extra; + source = conf->gen.reset_source; + } + + if (prohibitValueChange) + { + if (*conf->variable != newval) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + } + return false; + } + + if (changeVal) + { + /* Save old value to support transaction abort */ + if (!makeDefault) + push_old_value(&conf->gen); + + *conf->variable = 
newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + conf->gen.source = source; + } + if (makeDefault) + { + GtmOptStack *stack; + + if (conf->gen.reset_source <= source) + { + conf->reset_val = newval; + set_extra_field(&conf->gen, &conf->reset_extra, + newextra); + conf->gen.reset_source = source; + } + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (stack->source <= source) + { + stack->prior.val.boolval = newval; + set_extra_field(&conf->gen, &stack->prior.extra, + newextra); + stack->source = source; + } + } + } + + /* Perhaps we didn't install newextra anywhere */ + if (newextra && !extra_field_used(&conf->gen, newextra)) + free(newextra); + break; + } + + case GTMC_INT: + { + struct config_int *conf = (struct config_int *) record; + int newval; + void *newextra = NULL; + + if (value) + { + const char *hintmsg; + + if (!parse_int(value, &newval, conf->gen.flags, &hintmsg)) + { + if (isStartUp) + { + write_stderr("invalid value for parameter \"%s\": \"%s\"\n", + name, value); + } + else + { + ereport(elevel, + (0, + errmsg("invalid value for parameter \"%s\": \"%s\"", + name, value), + hintmsg ? errhint("%s", _(hintmsg)) : 0)); + } + return false; + } + if (newval < conf->min || newval > conf->max) + { + if (isStartUp) + { + write_stderr("%d is outside the valid range for parameter \"%s\" (%d .. %d)\n", + newval, name, conf->min, conf->max); + } + else + { + ereport(elevel, + (0, + errmsg("%d is outside the valid range for parameter \"%s\" (%d .. 
%d)", + newval, name, conf->min, conf->max))); + } + return false; + } + } + else if (source == GTMC_S_DEFAULT) + { + newval = conf->boot_val; + } + else + { + newval = conf->reset_val; + newextra = conf->reset_extra; + source = conf->gen.reset_source; + } + + if (prohibitValueChange) + { + if (*conf->variable != newval) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + } + return false; + } + + if (changeVal) + { + /* Save old value to support transaction abort */ + if (!makeDefault) + push_old_value(&conf->gen); + + *conf->variable = newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + conf->gen.source = source; + } + if (makeDefault) + { + GtmOptStack *stack; + + if (conf->gen.reset_source <= source) + { + conf->reset_val = newval; + set_extra_field(&conf->gen, &conf->reset_extra, + newextra); + conf->gen.reset_source = source; + } + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (stack->source <= source) + { + stack->prior.val.intval = newval; + set_extra_field(&conf->gen, &stack->prior.extra, + newextra); + stack->source = source; + } + } + } + + /* Perhaps we didn't install newextra anywhere */ + if (newextra && !extra_field_used(&conf->gen, newextra)) + free(newextra); + break; + } + + case GTMC_REAL: + { + struct config_real *conf = (struct config_real *) record; + double newval; + void *newextra = NULL; + + if (value) + { + if (!parse_real(value, &newval)) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" requires a numeric value\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" requires a numeric value", + name))); + } + return false; + } + if (newval < conf->min || newval > conf->max) + { + if (isStartUp) + { + write_stderr("%g is outside the valid range for parameter \"%s\" (%g .. 
%g)\n", + newval, name, conf->min, conf->max); + } + else + { + ereport(elevel, + (0, + errmsg("%g is outside the valid range for parameter \"%s\" (%g .. %g)", + newval, name, conf->min, conf->max))); + } + return false; + } + } + else if (source == GTMC_S_DEFAULT) + { + newval = conf->boot_val; + } + else + { + newval = conf->reset_val; + newextra = conf->reset_extra; + source = conf->gen.reset_source; + } + + if (prohibitValueChange) + { + if (*conf->variable != newval) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + } + return false; + } + + if (changeVal) + { + /* Save old value to support transaction abort */ + if (!makeDefault) + push_old_value(&conf->gen); + + *conf->variable = newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + conf->gen.source = source; + } + if (makeDefault) + { + GtmOptStack *stack; + + if (conf->gen.reset_source <= source) + { + conf->reset_val = newval; + set_extra_field(&conf->gen, &conf->reset_extra, + newextra); + conf->gen.reset_source = source; + } + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (stack->source <= source) + { + stack->prior.val.realval = newval; + set_extra_field(&conf->gen, &stack->prior.extra, + newextra); + stack->source = source; + } + } + } + + /* Perhaps we didn't install newextra anywhere */ + if (newextra && !extra_field_used(&conf->gen, newextra)) + free(newextra); + break; + } + + case GTMC_STRING: + { + struct config_string *conf = (struct config_string *) record; + char *newval; + void *newextra = NULL; + + if (value) + { + /* + * The value passed by the caller could be transient, so + * we always strdup it. 
+ */ + newval = gtm_opt_strdup(elevel, value); + if (newval == NULL) + return false; + } + else if (source == GTMC_S_DEFAULT) + { + /* non-NULL boot_val must always get strdup'd */ + if (conf->boot_val != NULL) + { + newval = gtm_opt_strdup(elevel, conf->boot_val); + if (newval == NULL) + return false; + } + else + newval = NULL; + + } + else + { + /* + * strdup not needed, since reset_val is already under + * guc.c's control + */ + newval = conf->reset_val; + newextra = conf->reset_extra; + source = conf->gen.reset_source; + } + + if (prohibitValueChange) + { + /* newval shouldn't be NULL, so we're a bit sloppy here */ + if (*conf->variable == NULL || newval == NULL || + strcmp(*conf->variable, newval) != 0) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + } + return false; + } + + if (changeVal) + { + /* Save old value to support transaction abort */ + if (!makeDefault) + push_old_value(&conf->gen); + + set_string_field(conf, conf->variable, newval); + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + conf->gen.source = source; + } + + if (makeDefault) + { + GtmOptStack *stack; + + if (conf->gen.reset_source <= source) + { + set_string_field(conf, &conf->reset_val, newval); + set_extra_field(&conf->gen, &conf->reset_extra, + newextra); + conf->gen.reset_source = source; + } + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (stack->source <= source) + { + set_string_field(conf, &stack->prior.val.stringval, + newval); + set_extra_field(&conf->gen, &stack->prior.extra, + newextra); + stack->source = source; + } + } + } + + /* Perhaps we didn't install newval anywhere */ + if (newval && !string_field_used(conf, newval)) + free(newval); + /* Perhaps we didn't install newextra anywhere */ + if (newextra && !extra_field_used(&conf->gen, 
newextra)) + free(newextra); + break; + } + + case GTMC_ENUM: + { + struct config_enum *conf = (struct config_enum *) record; + int newval; + void *newextra = NULL; + + if (value) + { + if (!config_enum_lookup_by_name(conf, value, &newval)) + { + char *hintmsg; + + hintmsg = config_enum_get_options(conf, + "Available values: ", + ".", ", "); + + if (isStartUp) + { + write_stderr("invalid value for parameter \"%s\": \"%s\". %s\n", + name, value, hintmsg); + } + else + { + ereport(elevel, + (0, + errmsg("invalid value for parameter \"%s\": \"%s\"", + name, value), + hintmsg ? errhint("%s", _(hintmsg)) : 0)); + } + + if (hintmsg) + free(hintmsg); + return false; + } + } + else if (source == GTMC_S_DEFAULT) + { + newval = conf->boot_val; + } + else + { + newval = conf->reset_val; + newextra = conf->reset_extra; + source = conf->gen.reset_source; + } + + if (prohibitValueChange) + { + if (*conf->variable != newval) + { + if (isStartUp) + { + write_stderr("parameter \"%s\" cannot be changed without restarting the server\n", + name); + } + else + { + ereport(elevel, + (0, + errmsg("parameter \"%s\" cannot be changed without restarting the server", + name))); + } + } + return false; + } + + if (changeVal) + { + /* Save old value to support transaction abort */ + if (!makeDefault) + push_old_value(&conf->gen); + + *conf->variable = newval; + set_extra_field(&conf->gen, &conf->gen.extra, + newextra); + conf->gen.source = source; + } + if (makeDefault) + { + GtmOptStack *stack; + + if (conf->gen.reset_source <= source) + { + conf->reset_val = newval; + set_extra_field(&conf->gen, &conf->reset_extra, + newextra); + conf->gen.reset_source = source; + } + for (stack = conf->gen.stack; stack; stack = stack->prev) + { + if (stack->source <= source) + { + stack->prior.val.enumval = newval; + set_extra_field(&conf->gen, &stack->prior.extra, + newextra); + stack->source = source; + } + } + } + + /* Perhaps we didn't install newextra anywhere */ + if (newextra && 
!extra_field_used(&conf->gen, newextra)) + free(newextra); + break; + } + } + + if (changeVal && (record->flags & GTMOPT_REPORT)) + ReportGTMOption(record); + + return true; +} + + + + +/* + * Set the fields for source file and line number the setting came from. + */ +static void +set_config_sourcefile(const char *name, char *sourcefile, int sourceline) +{ + struct config_generic *record; + int elevel; + + /* + * To avoid cluttering the log, only the postmaster bleats loudly about + * problems with the config file. + */ + elevel = DEBUG3; + + record = find_option(name, true, elevel); + /* should not happen */ + if (record == NULL) + { + if (isStartUp) + write_stderr("unrecognized configuration parameter \"%s\"\n", name); + else + elog(ERROR, "unrecognized configuration parameter \"%s\"", name); + } + + sourcefile = gtm_opt_strdup(elevel, sourcefile); + if (record->sourcefile) + free(record->sourcefile); + record->sourcefile = sourcefile; + record->sourceline = sourceline; +} + + +/* + * Set a config option to the given value. See also set_config_option, + * this is just the wrapper to be called from outside GTM. NB: this + * is used only for non-transactional operations. + * + * Note: there is no support here for setting source file/line, as it + * is currently not needed. + */ +void +SetConfigOption(const char *name, const char *value, + GtmOptContext context, GtmOptSource source) +{ + (void) set_config_option(name, value, context, source, + true); +} + + + + +/* + * Fetch the current value of the option `name'. If the option doesn't exist, + * throw an ereport and don't return. + * + * If restrict_superuser is true, we also enforce that only superusers can + * see GTMOPT_SUPERUSER_ONLY variables. This should only be passed as true + * in user-driven calls. + * + * The string is *not* allocated for modification and is really only + * valid until the next call to configuration related functions. 
+ */ +const char * +GetConfigOption(const char *name, bool restrict_superuser) +{ + struct config_generic *record; + static char buffer[256]; + + record = find_option(name, false, ERROR); + if (record == NULL) + { + if (isStartUp) + write_stderr("unrecognized configuration parameter \"%s\"\n", name); + else + ereport(ERROR, + (0, + errmsg("unrecognized configuration parameter \"%s\"", name))); + } + switch (record->vartype) + { + case GTMC_BOOL: + return *((struct config_bool *) record)->variable ? "on" : "off"; + + case GTMC_INT: + snprintf(buffer, sizeof(buffer), "%d", + *((struct config_int *) record)->variable); + return buffer; + + case GTMC_REAL: + snprintf(buffer, sizeof(buffer), "%g", + *((struct config_real *) record)->variable); + return buffer; + + case GTMC_STRING: + return *((struct config_string *) record)->variable; + + case GTMC_ENUM: + return config_enum_lookup_by_value((struct config_enum *) record, + *((struct config_enum *) record)->variable); + } + return NULL; +} + + +/* + * Get the RESET value associated with the given option. + * + * Note: this is not re-entrant, due to use of static result buffer; + * not to mention that a string variable could have its reset_val changed. + * Beware of assuming the result value is good for very long. + */ +const char * +GetConfigOptionResetString(const char *name) +{ + struct config_generic *record; + static char buffer[256]; + + record = find_option(name, false, ERROR); + if (record == NULL) + { + if (isStartUp) + write_stderr("unrecognized configuration parameter \"%s\"\n", name); + else + ereport(ERROR, + (0, + errmsg("unrecognized configuration parameter \"%s\"", name))); + } + + switch (record->vartype) + { + case GTMC_BOOL: + return ((struct config_bool *) record)->reset_val ? 
"on" : "off"; + + case GTMC_INT: + snprintf(buffer, sizeof(buffer), "%d", + ((struct config_int *) record)->reset_val); + return buffer; + + case GTMC_REAL: + snprintf(buffer, sizeof(buffer), "%g", + ((struct config_real *) record)->reset_val); + return buffer; + + case GTMC_STRING: + return ((struct config_string *) record)->reset_val; + + case GTMC_ENUM: + return config_enum_lookup_by_value((struct config_enum *) record, + ((struct config_enum *) record)->reset_val); + } + return NULL; +} + + +void +EmitWarningsOnPlaceholders(const char *className) +{ + int classLen = strlen(className); + int i; + + for (i = 0; i < num_gtm_opt_variables; i++) + { + struct config_generic *var = gtm_opt_variables[i]; + + if ((var->flags & GTMOPT_CUSTOM_PLACEHOLDER) != 0 && + strncmp(className, var->name, classLen) == 0 && + var->name[classLen] == GTMOPT_QUALIFIER_SEPARATOR) + { + if (isStartUp) + write_stderr("unrecognized configuration parameter \"%s\"\n", + var->name); + else + ereport(WARNING, + (0, + errmsg("unrecognized configuration parameter \"%s\"", + var->name))); + } + } +} + + +/* + * Return GTM variable value by name; optionally return canonical + * form of name. Return value is malloc'd. + */ +char * +GetConfigOptionByName(const char *name, const char **varname) +{ + struct config_generic *record; + + record = find_option(name, false, ERROR); + if (record == NULL) + { + if (isStartUp) + write_stderr("unrecognized configuration parameter \"%s\"\n", name); + else + ereport(ERROR, + (0, + errmsg("unrecognized configuration parameter \"%s\"", name))); + } + if (varname) + *varname = record->name; + + return _ShowOption(record, true); +} + +/* + * Return GTM variable value by variable number; optionally return canonical + * form of name. Return value is malloc'd. 
+ */ +void +GetConfigOptionByNum(int varnum, const char **values, bool *noshow) +{ + char buffer[256]; + struct config_generic *conf; + + /* check requested variable number valid */ + Assert((varnum >= 0) && (varnum < num_gtm_opt_variables)); + + conf = gtm_opt_variables[varnum]; + + if (noshow) + { + if (conf->flags & GTMOPT_NO_SHOW_ALL) + *noshow = true; + else + *noshow = false; + } + + /* first get the generic attributes */ + + /* name */ + values[0] = conf->name; + + /* setting : use _ShowOption in order to avoid duplicating the logic */ + values[1] = _ShowOption(conf, false); + + /* unit */ + if (conf->vartype == GTMC_INT) + { + static char buf[8]; + + switch (conf->flags & (GTMOPT_UNIT_MEMORY | GTMOPT_UNIT_TIME)) + { + case GTMOPT_UNIT_KB: + values[2] = "kB"; + break; + case GTMOPT_UNIT_BLOCKS: + snprintf(buf, sizeof(buf), "%dkB", BLCKSZ / 1024); + values[2] = buf; + break; + case GTMOPT_UNIT_XBLOCKS: + snprintf(buf, sizeof(buf), "%dkB", XLOG_BLCKSZ / 1024); + values[2] = buf; + break; + case GTMOPT_UNIT_MS: + values[2] = "ms"; + break; + case GTMOPT_UNIT_S: + values[2] = "s"; + break; + case GTMOPT_UNIT_MIN: + values[2] = "min"; + break; + default: + values[2] = ""; + break; + } + } + else + values[2] = NULL; + +#if 0 + /* PGXCTODO: Group parameters are not used yet */ + /* group */ + values[3] = config_group_names[conf->group]; +#endif + + /* short_desc */ + values[4] = conf->short_desc; + + /* extra_desc */ + values[5] = conf->long_desc; + + /* context */ + values[6] = GtmOptContext_Names[conf->context]; + + /* vartype */ + values[7] = config_type_names[conf->vartype]; + + /* source */ + values[8] = GtmOptSource_Names[conf->source]; + + /* now get the type specifc attributes */ + switch (conf->vartype) + { + case GTMC_BOOL: + { + struct config_bool *lconf = (struct config_bool *) conf; + + /* min_val */ + values[9] = NULL; + + /* max_val */ + values[10] = NULL; + + /* enumvals */ + values[11] = NULL; + + /* boot_val */ + values[12] = 
strdup(lconf->boot_val ? "on" : "off"); + + /* reset_val */ + values[13] = strdup(lconf->reset_val ? "on" : "off"); + } + break; + + case GTMC_INT: + { + struct config_int *lconf = (struct config_int *) conf; + + /* min_val */ + snprintf(buffer, sizeof(buffer), "%d", lconf->min); + values[9] = strdup(buffer); + + /* max_val */ + snprintf(buffer, sizeof(buffer), "%d", lconf->max); + values[10] = strdup(buffer); + + /* enumvals */ + values[11] = NULL; + + /* boot_val */ + snprintf(buffer, sizeof(buffer), "%d", lconf->boot_val); + values[12] = strdup(buffer); + + /* reset_val */ + snprintf(buffer, sizeof(buffer), "%d", lconf->reset_val); + values[13] = strdup(buffer); + } + break; + + case GTMC_REAL: + { + struct config_real *lconf = (struct config_real *) conf; + + /* min_val */ + snprintf(buffer, sizeof(buffer), "%g", lconf->min); + values[9] = strdup(buffer); + + /* max_val */ + snprintf(buffer, sizeof(buffer), "%g", lconf->max); + values[10] = strdup(buffer); + + /* enumvals */ + values[11] = NULL; + + /* boot_val */ + snprintf(buffer, sizeof(buffer), "%g", lconf->boot_val); + values[12] = strdup(buffer); + + /* reset_val */ + snprintf(buffer, sizeof(buffer), "%g", lconf->reset_val); + values[13] = strdup(buffer); + } + break; + + case GTMC_STRING: + { + struct config_string *lconf = (struct config_string *) conf; + + /* min_val */ + values[9] = NULL; + + /* max_val */ + values[10] = NULL; + + /* enumvals */ + values[11] = NULL; + + /* boot_val */ + if (lconf->boot_val == NULL) + values[12] = NULL; + else + values[12] = strdup(lconf->boot_val); + + /* reset_val */ + if (lconf->reset_val == NULL) + values[13] = NULL; + else + values[13] = strdup(lconf->reset_val); + } + break; + + case GTMC_ENUM: + { + struct config_enum *lconf = (struct config_enum *) conf; + + /* min_val */ + values[9] = NULL; + + /* max_val */ + values[10] = NULL; + + /* enumvals */ + + /* + * NOTE! enumvals with double quotes in them are not + * supported! 
+ */ + values[11] = config_enum_get_options((struct config_enum *) conf, + "{\"", "\"}", "\",\""); + + /* boot_val */ + values[12] = strdup(config_enum_lookup_by_value(lconf, + lconf->boot_val)); + + /* reset_val */ + values[13] = strdup(config_enum_lookup_by_value(lconf, + lconf->reset_val)); + } + break; + + default: + { + /* + * should never get here, but in case we do, set 'em to NULL + */ + + /* min_val */ + values[9] = NULL; + + /* max_val */ + values[10] = NULL; + + /* enumvals */ + values[11] = NULL; + + /* boot_val */ + values[12] = NULL; + + /* reset_val */ + values[13] = NULL; + } + break; + } + + /* + * If the setting came from a config file, set the source location. For + * security reasons, we don't show source file/line number for + * non-superusers. + */ + if (conf->source == GTMC_S_FILE) + { + values[14] = conf->sourcefile; + snprintf(buffer, sizeof(buffer), "%d", conf->sourceline); + values[15] = strdup(buffer); + } + else + { + values[14] = NULL; + values[15] = NULL; + } +} + +/* + * Return the total number of GTM variables + */ +int +GetNumConfigOptions(void) +{ + return num_gtm_opt_variables; +} + + +static char * +_ShowOption(struct config_generic * record, bool use_units) +{ + char buffer[256]; + const char *val; + + switch (record->vartype) + { + case GTMC_BOOL: + { + struct config_bool *conf = (struct config_bool *) record; + + val = *conf->variable ? "on" : "off"; + } + break; + + case GTMC_INT: + { + struct config_int *conf = (struct config_int *) record; + + /* + * Use int64 arithmetic to avoid overflows in units + * conversion. 
+ */ + int64 result = *conf->variable; + const char *unit; + + if (use_units && result > 0 && + (record->flags & GTMOPT_UNIT_MEMORY)) + { + switch (record->flags & GTMOPT_UNIT_MEMORY) + { + case GTMOPT_UNIT_BLOCKS: + result *= BLCKSZ / 1024; + break; + case GTMOPT_UNIT_XBLOCKS: + result *= XLOG_BLCKSZ / 1024; + break; + } + + if (result % KB_PER_GB == 0) + { + result /= KB_PER_GB; + unit = "GB"; + } + else if (result % KB_PER_MB == 0) + { + result /= KB_PER_MB; + unit = "MB"; + } + else + { + unit = "kB"; + } + } + else if (use_units && result > 0 && + (record->flags & GTMOPT_UNIT_TIME)) + { + switch (record->flags & GTMOPT_UNIT_TIME) + { + case GTMOPT_UNIT_S: + result *= MS_PER_S; + break; + case GTMOPT_UNIT_MIN: + result *= MS_PER_MIN; + break; + } + + if (result % MS_PER_D == 0) + { + result /= MS_PER_D; + unit = "d"; + } + else if (result % MS_PER_H == 0) + { + result /= MS_PER_H; + unit = "h"; + } + else if (result % MS_PER_MIN == 0) + { + result /= MS_PER_MIN; + unit = "min"; + } + else if (result % MS_PER_S == 0) + { + result /= MS_PER_S; + unit = "s"; + } + else + { + unit = "ms"; + } + } + else + unit = ""; + + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s", + result, unit); + val = buffer; + + } + break; + + case GTMC_REAL: + { + struct config_real *conf = (struct config_real *) record; + + snprintf(buffer, sizeof(buffer), "%g", + *conf->variable); + val = buffer; + } + break; + + case GTMC_STRING: + { + struct config_string *conf = (struct config_string *) record; + + if (*conf->variable && **conf->variable) + val = *conf->variable; + else + val = ""; + } + break; + + case GTMC_ENUM: + { + struct config_enum *conf = (struct config_enum *) record; + + val = config_enum_lookup_by_value(conf, *conf->variable); + } + break; + + default: + /* just to keep compiler quiet */ + val = "???"; + break; + } + + return strdup(val); +} + + + +/* + * A little "long argument" simulation, although not quite GNU + * compliant. 
Takes a string of the form "some-option=some value" and + * returns name = "some_option" and value = "some value" in malloc'ed + * storage. Note that '-' is converted to '_' in the option name. If + * there is no '=' in the input string then value will be NULL. + */ +void +ParseLongOption(const char *string, char **name, char **value) +{ + size_t equal_pos; + char *cp; + + AssertArg(string); + AssertArg(name); + AssertArg(value); + + equal_pos = strcspn(string, "="); + + if (string[equal_pos] == '=') + { + *name = gtm_opt_malloc(FATAL, equal_pos + 1); + strlcpy(*name, string, equal_pos + 1); + + *value = gtm_opt_strdup(FATAL, &string[equal_pos + 1]); + } + else + { + /* no equal sign in string */ + *name = gtm_opt_strdup(FATAL, string); + *value = NULL; + } + + for (cp = *name; *cp; cp++) + if (*cp == '-') + *cp = '_'; +} + +#if 0 +/* + * keep-alive related APIs will be used in future extensions + */ +void +gtm_assign_tcp_keepalives_idle(int newval, void *extra) +{ + /* + * The kernel API provides no way to test a value without setting it; and + * once we set it we might fail to unset it. So there seems little point + * in fully implementing the check-then-assign GTM API for these + * variables. Instead we just do the assignment on demand. pqcomm.c + * reports any problems via elog(LOG). + * + * This approach means that the GTM value might have little to do with the + * actual kernel value, so we use a show_hook that retrieves the kernel + * value rather than trusting GTM's copy. 
+ */ +#if 0 + (void) pq_setkeepalivesidle(newval, MyProcPort); +#else + (void) pq_setkeepalivesidle_all(newval); +#endif +} + +const char * +gtm_show_tcp_keepalives_idle(void) +{ + /* See comments in assign_tcp_keepalives_idle */ + static char nbuf[16]; + +#if 0 + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesidle(MyProcPort)); +#else + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesidle_all()); +#endif + return nbuf; +} + +void +gtm_assign_tcp_keepalives_interval(int newval, void *extra) +{ + /* See comments in assign_tcp_keepalives_idle */ +#if 0 + (void) pq_setkeepalivesinterval(newval, MyProcPort); +#else + (void) pq_setkeepalivesinterval_all(newval); +#endif +} + +const char * +gtm_show_tcp_keepalives_interval(void) +{ + /* See comments in assign_tcp_keepalives_idle */ + static char nbuf[16]; + +#if 0 + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesinterval(MyProcPort)); +#else + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesinterval_all()); +#endif + return nbuf; +} + +void +gtm_assign_tcp_keepalives_count(int newval, void *extra) +{ + /* See comments in assign_tcp_keepalives_idle */ +#if 0 + (void) pq_setkeepalivescount(newval, MyProcPort); +#else + (void) pq_setkeepalivescount_all(newval); +#endif +} + +const char * +gtm_show_tcp_keepalives_count(void) +{ + /* See comments in assign_tcp_keepalives_idle */ + static char nbuf[16]; + +#if 0 + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivescount(MyProcPort)); +#else + snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivescount_all()); +#endif + return nbuf; +} +#endif + +/* + * Try to interpret value as boolean value. Valid values are: true, + * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof. + * If the string parses okay, return true, else false. + * If okay and result is not NULL, return the value in *result. 
+ */ +static bool +gtm_opt_parse_bool(const char *value, bool *result) +{ + return gtm_opt_parse_bool_with_len(value, strlen(value), result); +} + +static bool +gtm_opt_parse_bool_with_len(const char *value, size_t len, bool *result) +{ + switch (*value) + { + case 't': + case 'T': + if (pg_strncasecmp(value, "true", len) == 0) + { + if (result) + *result = true; + return true; + } + break; + case 'f': + case 'F': + if (pg_strncasecmp(value, "false", len) == 0) + { + if (result) + *result = false; + return true; + } + break; + case 'y': + case 'Y': + if (pg_strncasecmp(value, "yes", len) == 0) + { + if (result) + *result = true; + return true; + } + break; + case 'n': + case 'N': + if (pg_strncasecmp(value, "no", len) == 0) + { + if (result) + *result = false; + return true; + } + break; + case 'o': + case 'O': + /* 'o' is not unique enough */ + if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0) + { + if (result) + *result = true; + return true; + } + else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0) + { + if (result) + *result = false; + return true; + } + break; + case '1': + if (len == 1) + { + if (result) + *result = true; + return true; + } + break; + case '0': + if (len == 1) + { + if (result) + *result = false; + return true; + } + break; + default: + break; + } + + if (result) + *result = false; /* suppress compiler warning */ + return false; +} + +/* + * ReportGUCOption: if appropriate, transmit option value to frontend + */ +static void +ReportGTMOption(struct config_generic * record) +{ + /* So far, it is empty. */ +} + +/* + * Lookup the name for an enum option with the selected value. + * Should only ever be called with known-valid values, so throws + * an elog(ERROR) if the enum option is not found. + * + * The returned string is a pointer to static data and not + * allocated for modification. 
+ */ +const char * +config_enum_lookup_by_value(struct config_enum * record, int val) +{ + const struct config_enum_entry *entry; + + for (entry = record->options; entry && entry->name; entry++) + { + if (entry->val == val) + return entry->name; + } + + if (isStartUp) + write_stderr("could not find enum option %d for %s\n", + val, record->gen.name); + else + elog(ERROR, "could not find enum option %d for %s", + val, record->gen.name); + return NULL; /* silence compiler */ +} diff --git a/src/gtm/common/gtm_opt_scanner.l b/src/gtm/common/gtm_opt_scanner.l new file mode 100644 index 0000000000..f9be2cbfbe --- /dev/null +++ b/src/gtm/common/gtm_opt_scanner.l @@ -0,0 +1,92 @@ +/* -*-pgsql-c-*- */ +/* + * Scanner for the configuration file + * + * Copyright (c) 2000-2011, PostgreSQL Global Development Group + * + * src/backend/utils/misc/guc-file.l + */ + +%{ + +#include "gtm/gtm.h" + +#include <ctype.h> +#include <unistd.h> +#include <stdlib.h> + +#include "mb/pg_wchar.h" +#include "gtm/assert.h" +#include "gtm/gtm_opt.h" +#include "gtm/elog.h" + + +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg))) + +enum { + GTMOPT_ID = 1, + GTMOPT_STRING = 2, + GTMOPT_INTEGER = 3, + GTMOPT_REAL = 4, + GTMOPT_EQUALS = 5, + GTMOPT_UNQUOTED_STRING = 6, + GTMOPT_QUALIFIED_ID = 7, + GTMOPT_EOL = 99, + GTMOPT_ERROR = 100 +}; + +static unsigned int ConfigFileLineno; + +/* flex fails to supply a prototype for yylex, so provide one */ +int GTMOPT_yylex(void); + +%} + +%option 8bit +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option prefix="GTMOPT_yy" + + +SIGN ("-"|"+") +DIGIT [0-9] +HEXDIGIT [0-9a-fA-F] + +UNIT_LETTER [a-zA-Z] + +INTEGER {SIGN}?({DIGIT}+|0x{HEXDIGIT}+){UNIT_LETTER}* + +EXPONENT [Ee]{SIGN}?{DIGIT}+ +REAL {SIGN}?{DIGIT}*"."{DIGIT}*{EXPONENT}? 
+ +LETTER [A-Za-z_\200-\377] +LETTER_OR_DIGIT [A-Za-z_0-9\200-\377] + +ID {LETTER}{LETTER_OR_DIGIT}* +QUALIFIED_ID {ID}"."{ID} + +UNQUOTED_STRING {LETTER}({LETTER_OR_DIGIT}|[-._:/])* +STRING \'([^'\\\n]|\\.|\'\')*\' + +%% + +\n ConfigFileLineno++; return GTMOPT_EOL; +[ \t\r]+ /* eat whitespace */ +#.* /* eat comment (.* matches anything until newline) */ + +{ID} return GTMOPT_ID; +{QUALIFIED_ID} return GTMOPT_QUALIFIED_ID; +{STRING} return GTMOPT_STRING; +{UNQUOTED_STRING} return GTMOPT_UNQUOTED_STRING; +{INTEGER} return GTMOPT_INTEGER; +{REAL} return GTMOPT_REAL; += return GTMOPT_EQUALS; + +. return GTMOPT_ERROR; + +%% diff --git a/src/gtm/common/gtm_serialize.c b/src/gtm/common/gtm_serialize.c index bb8e368d7c..9b870957b9 100644 --- a/src/gtm/common/gtm_serialize.c +++ b/src/gtm/common/gtm_serialize.c @@ -3,6 +3,11 @@ * gtm_serialize.c * Serialization management of GTM data * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -701,6 +706,13 @@ gtm_get_pgxcnodeinfo_size(GTM_PGXCNodeInfo *data) len += sizeof(GTM_PGXCNodeStatus); /* status */ +#ifdef XCP + len += sizeof(uint32); /* max_sessions */ + len += sizeof(uint32); /* num_sessions */ + if (data->num_sessions > 0) /* sessions */ + len += (data->num_sessions * sizeof(GTM_PGXCSession)); +#endif + return len; } @@ -787,6 +799,21 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen) memcpy(buf + len, &(data->status), sizeof(GTM_PGXCNodeStatus)); len += sizeof(GTM_PGXCNodeStatus); +#ifdef XCP + /* GTM_PGXCNodeInfo.sessions */ + len_wk = data->max_sessions; + memcpy(buf + len, &len_wk, sizeof(uint32)); + len += sizeof(uint32); + len_wk = data->num_sessions; + memcpy(buf + len, &len_wk, sizeof(uint32)); + len += sizeof(uint32); + if (len_wk > 0) + { + memcpy(buf + len, data->sessions, len_wk * sizeof(GTM_PGXCSession)); + len += len_wk * sizeof(GTM_PGXCSession); + } +#endif + /* NOTE: nothing to be done for node_lock */ return len; } @@ -795,25 +822,46 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen) /* * Return a deserialize number of PGXC node information */ +#ifdef XCP +size_t +gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buflen, PQExpBuffer *errorbuf) +#else size_t gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buflen) +#endif { size_t len = 0; uint32 len_wk; /* GTM_PGXCNodeInfo.type */ +#ifdef XCP + if (len + sizeof(GTM_PGXCNodeType) > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info. 
buflen = %d", (int) buflen); + return (size_t) 0; + } +#endif memcpy(&(data->type), buf + len, sizeof(GTM_PGXCNodeType)); len += sizeof(GTM_PGXCNodeType); /* GTM_PGXCNodeInfo.nodename*/ memcpy(&len_wk, buf + len, sizeof(uint32)); len += sizeof(uint32); + if (len_wk == 0) { data->nodename = NULL; } else { +#ifdef XCP + if (len + len_wk > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node name"); + return (size_t) 0; + } +#endif + /* PGXCTODO: free memory */ data->nodename = (char *)genAlloc(len_wk + 1); memcpy(data->nodename, buf + len, (size_t)len_wk); @@ -821,6 +869,7 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf len += len_wk; } + /* GTM_PGXCNodeInfo.proxyname*/ memcpy(&len_wk, buf + len, sizeof(uint32)); len += sizeof(uint32); @@ -830,6 +879,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf } else { +#ifdef XCP + if (len + len_wk > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after proxy name"); + return (size_t) 0; + } +#endif /* PGXCTODO: free memory */ data->proxyname = (char *)genAlloc(len_wk + 1); memcpy(data->proxyname, buf + len, (size_t)len_wk); @@ -838,6 +894,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf } /* GTM_PGXCNodeInfo.port */ +#ifdef XCP + if (len + sizeof(GTM_PGXCNodePort) > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node port"); + return (size_t) 0; + } +#endif memcpy(&(data->port), buf + len, sizeof(GTM_PGXCNodePort)); len += sizeof(GTM_PGXCNodePort); @@ -850,6 +913,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf } else { +#ifdef XCP + if (len + len_wk > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of ipaddress"); + return (size_t) 0; + } +#endif data->ipaddress = (char *)genAlloc(len_wk + 1); 
memcpy(data->ipaddress, buf + len, (size_t)len_wk); data->ipaddress[len_wk] = 0; /* null_terminate */ @@ -865,6 +935,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf } else { +#ifdef XCP + if (len + len_wk > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after data folder"); + return (size_t) 0; + } +#endif data->datafolder = (char *)genAlloc(len_wk + 1); memcpy(data->datafolder, buf + len, (size_t)len_wk); data->datafolder[len_wk] = 0; /* null_terminate */ @@ -872,9 +949,39 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf } /* GTM_PGXCNodeInfo.status */ +#ifdef XCP + if (len + sizeof(GTM_PGXCNodeStatus) > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after status"); + return (size_t) 0; + } +#endif memcpy(&(data->status), buf + len, sizeof(GTM_PGXCNodeStatus)); len += sizeof(GTM_PGXCNodeStatus); +#ifdef XCP + /* GTM_PGXCNodeInfo.sessions */ + memcpy(&len_wk, buf + len, sizeof(uint32)); + len += sizeof(uint32); + data->max_sessions = len_wk; + if (len_wk > 0) + data->sessions = (GTM_PGXCSession *) + genAlloc(len_wk * sizeof(GTM_PGXCSession)); + memcpy(&len_wk, buf + len, sizeof(uint32)); + len += sizeof(uint32); + data->num_sessions = len_wk; + if (len_wk > 0) + { + if (len + (data->num_sessions * sizeof(GTM_PGXCSession)) > buflen) + { + printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of session info"); + return (size_t) 0; + } + memcpy(data->sessions, buf + len, len_wk * sizeof(GTM_PGXCSession)); + len += len_wk * sizeof(GTM_PGXCSession); + } +#endif + /* NOTE: nothing to be done for node_lock */ return len; @@ -894,7 +1001,13 @@ gtm_get_sequence_size(GTM_SeqInfo *seq) len += sizeof(GTM_SequenceKeyType); /* gs_key.gsk_type */ len += sizeof(GTM_Sequence); /* gs_value */ len += sizeof(GTM_Sequence); /* gs_init_value */ +#ifdef XCP + len += sizeof(uint32); /* 
gs_max_lastvals */ + len += sizeof(uint32); /* gs_lastval_count */ + len += seq->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */ +#else len += sizeof(GTM_Sequence); /* gs_last_value */ +#endif len += sizeof(GTM_Sequence); /* gs_increment_by */ len += sizeof(GTM_Sequence); /* gs_min_value */ len += sizeof(GTM_Sequence); /* gs_max_value */ @@ -935,8 +1048,18 @@ gtm_serialize_sequence(GTM_SeqInfo *s, char *buf, size_t buflen) memcpy(buf + len, &s->gs_init_value, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_init_value */ +#ifdef XCP + memcpy(buf + len, &s->gs_max_lastvals, sizeof(uint32)); + len += sizeof(uint32); /* gs_max_lastvals */ + memcpy(buf + len, &s->gs_lastval_count, sizeof(uint32)); + len += sizeof(uint32); /* gs_lastval_count */ + memcpy(buf + len, s->gs_last_values, + s->gs_lastval_count * sizeof(GTM_SeqLastVal)); + len += s->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */ +#else memcpy(buf + len, &s->gs_last_value, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_last_value */ +#endif memcpy(buf + len, &s->gs_increment_by, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_increment_by */ @@ -965,13 +1088,11 @@ gtm_serialize_sequence(GTM_SeqInfo *s, char *buf, size_t buflen) /* * Return number of deserialized sequence information */ -GTM_SeqInfo * -gtm_deserialize_sequence(const char *buf, size_t buflen) +size_t +gtm_deserialize_sequence(GTM_SeqInfo *seq, const char *buf, size_t buflen) { size_t len = 0; - GTM_SeqInfo *seq; - seq = (GTM_SeqInfo *)genAlloc0(sizeof(GTM_SeqInfo)); seq->gs_key = (GTM_SequenceKeyData *)genAlloc0(sizeof(GTM_SequenceKeyData)); memcpy(&seq->gs_key->gsk_keylen, buf + len, sizeof(uint32)); @@ -990,8 +1111,24 @@ gtm_deserialize_sequence(const char *buf, size_t buflen) memcpy(&seq->gs_init_value, buf + len, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_init_value */ +#ifdef XCP + memcpy(&seq->gs_max_lastvals, buf + len, sizeof(uint32)); + len += sizeof(uint32); 
/* gs_max_lastvals */ + if (seq->gs_max_lastvals > 0) + seq->gs_last_values = (GTM_SeqLastVal *) + genAlloc(seq->gs_max_lastvals * sizeof(GTM_SeqLastVal)); + memcpy(&seq->gs_lastval_count, buf + len, sizeof(uint32)); + len += sizeof(uint32); /* gs_lastval_count */ + if (seq->gs_lastval_count > 0) + { + memcpy(seq->gs_last_values, buf + len, + seq->gs_lastval_count * sizeof(GTM_SeqLastVal)); + len += seq->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */ + } +#else memcpy(&seq->gs_last_value, buf + len, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_last_value */ +#endif memcpy(&seq->gs_increment_by, buf + len, sizeof(GTM_Sequence)); len += sizeof(GTM_Sequence); /* gs_increment_by */ @@ -1014,5 +1151,5 @@ gtm_deserialize_sequence(const char *buf, size_t buflen) memcpy(&seq->gs_state, buf + len, sizeof(uint32)); len += sizeof(uint32); - return seq; + return len; } diff --git a/src/gtm/common/gtm_utils.c b/src/gtm/common/gtm_utils.c index ea6988640d..081660ec57 100644 --- a/src/gtm/common/gtm_utils.c +++ b/src/gtm/common/gtm_utils.c @@ -71,6 +71,7 @@ static struct enum_name message_name_tab[] = {MSG_SNAPSHOT_GXID_GET, "MSG_SNAPSHOT_GXID_GET"}, {MSG_SEQUENCE_INIT, "MSG_SEQUENCE_INIT"}, {MSG_BKUP_SEQUENCE_INIT, "MSG_BKUP_SEQUENCE_INIT"}, + {MSG_SEQUENCE_GET_CURRENT, "MSG_SEQUENCE_GET_CURRENT"}, {MSG_SEQUENCE_GET_NEXT, "MSG_SEQUENCE_GET_NEXT"}, {MSG_BKUP_SEQUENCE_GET_NEXT, "MSG_BKUP_SEQUENCE_GET_NEXT"}, {MSG_SEQUENCE_GET_LAST, "MSG_SEQUENCE_GET_LAST"}, @@ -124,6 +125,7 @@ static struct enum_name result_name_tab[] = {SNAPSHOT_GET_MULTI_RESULT, "SNAPSHOT_GET_MULTI_RESULT"}, {SNAPSHOT_GXID_GET_RESULT, "SNAPSHOT_GXID_GET_RESULT"}, {SEQUENCE_INIT_RESULT, "SEQUENCE_INIT_RESULT"}, + {SEQUENCE_GET_CURRENT_RESULT, "SEQUENCE_GET_CURRENT_RESULT"}, {SEQUENCE_GET_NEXT_RESULT, "SEQUENCE_GET_NEXT_RESULT"}, {SEQUENCE_GET_LAST_RESULT, "SEQUENCE_GET_LAST_RESULT"}, {SEQUENCE_SET_VAL_RESULT, "SEQUENCE_SET_VAL_RESULT"}, diff --git a/src/gtm/gtm_ctl/.gitignore 
b/src/gtm/gtm_ctl/.gitignore new file mode 100644 index 0000000000..ffe90d63fc --- /dev/null +++ b/src/gtm/gtm_ctl/.gitignore @@ -0,0 +1 @@ +/gtm_ctl diff --git a/src/gtm/gtm_ctl/Makefile b/src/gtm/gtm_ctl/Makefile new file mode 100644 index 0000000000..6b079b7832 --- /dev/null +++ b/src/gtm/gtm_ctl/Makefile @@ -0,0 +1,34 @@ +#---------------------------------------------------------------------------- +# +# Postgres-XC GTM gtm_ctl makefile +# +# Copyright(c) 2010-2012 Postgres-XC Development Group +# +# src/gtm/gtm_ctl/Makefile +# +#----------------------------------------------------------------------------- +top_builddir=../../.. +include $(top_builddir)/src/Makefile.global +subdir=src/gtm/gtm_ctl + +OBJS=gtm_ctl.o + +OTHERS=../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a + +LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq + + +LIBS=-lpthread + +gtm_ctl:$(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) -o gtm_ctl + +all:gtm_ctl + +clean: + rm -f $(OBJS) + rm -f gtm_ctl + +distclean: clean + +maintainer-clean: distclean diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c new file mode 100644 index 0000000000..29c78c8d00 --- /dev/null +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -0,0 +1,1317 @@ +/*------------------------------------------------------------------------- + * + * gtm_ctl --- start/stops/restarts the GTM server/proxy + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include "gtm/gtm_c.h" +#include "gtm/libpq-fe.h" + +#include <locale.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/time.h> +#include <sys/resource.h> +#endif + +#include "libpq/pqsignal.h" + +/* PID can be negative for standalone backend 
*/ +typedef long pgpid_t; + +typedef enum +{ + SMART_MODE, + FAST_MODE, + IMMEDIATE_MODE +} ShutdownMode; + + +typedef enum +{ + NO_COMMAND = 0, + START_COMMAND, + STOP_COMMAND, + PROMOTE_COMMAND, + RESTART_COMMAND, + STATUS_COMMAND, + RECONNECT_COMMAND +} CtlCommand; + +#define DEFAULT_WAIT 60 + +static bool do_wait = false; +static bool wait_set = false; +static int wait_seconds = DEFAULT_WAIT; +static bool silent_mode = false; +static ShutdownMode shutdown_mode = SMART_MODE; +static int sig = SIGTERM; /* default */ +static CtlCommand ctl_command = NO_COMMAND; +static char *gtm_data = NULL; +static char *gtmdata_opt = NULL; +static char *gtm_opts = NULL; +static const char *progname; +static char *log_file = NULL; +static char *gtm_path = NULL; +static char *gtm_app = NULL; +static char *argv0 = NULL; + +static void +write_stderr(const char *fmt,...) +/* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ +__attribute__((format(printf, 1, 2))); +static void *pg_malloc(size_t size); +static char *xstrdup(const char *s); +static void do_advice(void); +static void do_help(void); +static void set_mode(char *modeopt); +static void do_start(void); +static void do_stop(void); +static void do_restart(void); +static void do_reconnect(void); +static void print_msg(const char *msg); + +static pgpid_t get_pgpid(void); +static char **readfile(const char *path); +static int start_gtm(void); +static void read_gtm_opts(void); + +static bool test_gtm_connection(); +static bool gtm_is_alive(pid_t pid); + +static void *pg_realloc(void *ptr, size_t size); +static int RunAsDaemon(char *cmd); + +static char gtmopts_file[MAXPGPATH]; +static char pid_file[MAXPGPATH]; +static char conf_file[MAXPGPATH]; + +/* + * Write errors to stderr (or by gtm_equal means when stderr is + * not available). + */ +static void +write_stderr(const char *fmt,...) 
+{ + va_list ap; + + va_start(ap, fmt); + + /* On Unix, we just fprintf to stderr */ + vfprintf(stderr, fmt, ap); + va_end(ap); +} + +/* + * routines to check memory allocations and fail noisily. + */ + +static void * +pg_malloc(size_t size) +{ + void *result; + + result = malloc(size); + if (!result) + { + write_stderr(_("%s: out of memory\n"), progname); + exit(1); + } + return result; +} + + +static char * +xstrdup(const char *s) +{ + char *result; + + result = strdup(s); + if (!result) + { + write_stderr(_("%s: out of memory\n"), progname); + exit(1); + } + return result; +} + +/* + * Given an already-localized string, print it to stdout unless the + * user has specified that no messages should be printed. + */ +static void +print_msg(const char *msg) +{ + if (!silent_mode) + { + fputs(msg, stdout); + fflush(stdout); + } +} + +static pgpid_t +get_pgpid(void) +{ + FILE *pidf; + long pid; + + pidf = fopen(pid_file, "r"); + if (pidf == NULL) + { + /* No pid file, not an error on startup */ + if (errno == ENOENT) + return 0; + else + { + write_stderr(_("%s: could not open PID file \"%s\": %s\n"), + progname, pid_file, strerror(errno)); + exit(1); + } + } + if (fscanf(pidf, "%ld", &pid) != 1) + { + write_stderr(_("%s: invalid data in PID file \"%s\"\n"), + progname, pid_file); + exit(1); + } + fclose(pidf); + return (pgpid_t) pid; +} + + +/* + * get the lines from a text file - return NULL if file can't be opened + */ +static char ** +readfile(const char *path) +{ + FILE *infile; + int maxlength = 0, + linelen = 0; + int nlines = 0; + char **result; + char *buffer; + int c; + + if ((infile = fopen(path, "r")) == NULL) + return NULL; + + /* pass over the file twice - the first time to size the result */ + + while ((c = fgetc(infile)) != EOF) + { + linelen++; + if (c == '\n') + { + nlines++; + if (linelen > maxlength) + maxlength = linelen; + linelen = 0; + } + } + + /* handle last line without a terminating newline (yuck) */ + if (linelen) + nlines++; + if (linelen > 
maxlength) + maxlength = linelen; + + /* set up the result and the line buffer */ + result = (char **) pg_malloc((nlines + 1) * sizeof(char *)); + buffer = (char *) pg_malloc(maxlength + 1); + + /* now reprocess the file and store the lines */ + rewind(infile); + nlines = 0; + while (fgets(buffer, maxlength + 1, infile) != NULL) + result[nlines++] = xstrdup(buffer); + + fclose(infile); + free(buffer); + result[nlines] = NULL; + + return result; +} + + + +/* + * start/test/stop routines + */ + +static int +start_gtm(void) +{ + char cmd[MAXPGPATH]; + char gtm_app_path[MAXPGPATH]; + int len; + + /* + * Since there might be quotes to handle here, it is easier simply to pass + * everything to a shell to process them. + */ + + memset(gtm_app_path, 0, MAXPGPATH); + memset(cmd, 0, MAXPGPATH); + + /* + * Build gtm binary path. We should leave one byte at the end for '\0' + */ + len = 0; + if (gtm_path != NULL) + { + strncpy(gtm_app_path, gtm_path, MAXPGPATH - len - 1); + + len = strlen(gtm_app_path); + strncat(gtm_app_path, "/", MAXPGPATH - len - 1); + + len = strlen(gtm_app_path); + } + + if (strlen(gtm_app) >= (MAXPGPATH - len - 1)) + { + write_stderr("gtm command exceeds max size"); + exit(1); + } + + strncat(gtm_app_path, gtm_app, MAXPGPATH - len - 1); + + if (log_file != NULL) + len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s -l %s &" SYSTEMQUOTE, + gtm_app_path, gtmdata_opt, gtm_opts, log_file); + else + len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE, + gtm_app_path, gtmdata_opt, gtm_opts, DEVNULL); + + if (len >= MAXPGPATH - 1) + { + write_stderr("gtm command exceeds max size"); + exit(1); + } + + if (log_file) + return (RunAsDaemon(cmd)); + else + return system(cmd); +} + +/* + * Run specified command as a daemon. + * Assume that *cmd includes '&' to run + * the command at background so that we need fork() + * only once. 
+ */ +static int RunAsDaemon(char *cmd) +{ + switch (fork()) + { + int status; + + case 0: + /* + * Using fileno(xxx) may encounter trivial error because xxx may + * have been closed at somewhere else and fileno() may fail. + * Its safer to use literal file descriptor here. + */ + close(0); + close(1); + close(2); + if ((status = system(cmd)) == -1) + /* + * Same behavior as /bin/sh could not be + * executed. + */ + exit(127); + else + exit(WEXITSTATUS(status)); + break; + case -1: + return -1; + default: + return 0; + break; + } +} + + +/* + * Find the gtm port and try a connection + */ +static bool +test_gtm_connection() +{ + GTM_Conn *conn; + bool success = false; + int i; + char portstr[32]; + char *p; + char *q; + char connstr[128]; /* Should be way more than enough! */ + + *portstr = '\0'; + + /* + * Look in gtm_opts for a -p switch. + * + * This parsing code is not amazingly bright; it could for instance + * get fooled if ' -p' occurs within a quoted argument value. Given + * that few people pass complicated settings in gtm_opts, it's + * probably good enough. + */ + for (p = gtm_opts; *p;) + { + /* advance past whitespace */ + while (isspace((unsigned char) *p)) + p++; + + if (strncmp(p, "-p", 2) == 0) + { + p += 2; + /* advance past any whitespace/quoting */ + while (isspace((unsigned char) *p) || *p == '\'' || *p == '"') + p++; + /* find end of value (not including any ending quote!) */ + q = p; + while (*q && + !(isspace((unsigned char) *q) || *q == '\'' || *q == '"')) + q++; + /* and save the argument value */ + strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr))); + /* keep looking, maybe there is another -p */ + p = q; + } + /* Advance to next whitespace */ + while (*p && !isspace((unsigned char) *p)) + p++; + } + + /* + * Search config file for a 'port' option. + * + * This parsing code isn't amazingly bright either, but it should be okay + * for valid port settings. 
+ */ + if (!*portstr) + { + char **optlines; + + optlines = readfile(conf_file); + if (optlines != NULL) + { + for (; *optlines != NULL; optlines++) + { + p = *optlines; + + while (isspace((unsigned char) *p)) + p++; + if (strncmp(p, "port", 4) != 0) + continue; + p += 4; + while (isspace((unsigned char) *p)) + p++; + if (*p != '=') + continue; + p++; + /* advance past any whitespace/quoting */ + while (isspace((unsigned char) *p) || *p == '\'' || *p == '"') + p++; + /* find end of value (not including any ending quote/comment!) */ + q = p; + while (*q && + !(isspace((unsigned char) *q) || + *q == '\'' || *q == '"' || *q == '#')) + q++; + /* and save the argument value */ + strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr))); + /* keep looking, maybe there is another */ + } + } + } + + /* Still not found? Use compiled-in default */ +#define GTM_DEFAULT_PORT 6666 + if (!*portstr) + snprintf(portstr, sizeof(portstr), "%d", GTM_DEFAULT_PORT); + + /* + * We need to set a connect timeout otherwise on Windows the SCM will + * probably timeout first + * a PGXC node ID has to be set for GTM connection protocol, + * so its value doesn't really matter here. 
+ */ + snprintf(connstr, sizeof(connstr), + "host=localhost port=%s connect_timeout=5 node_name=one", portstr); + + for (i = 0; i < wait_seconds; i++) + { + if ((conn = PQconnectGTM(connstr)) != NULL && + (GTMPQstatus(conn) == CONNECTION_OK)) + { + GTMPQfinish(conn); + success = true; + break; + } + else + { + GTMPQfinish(conn); + print_msg("."); + sleep(1); /* 1 sec */ + } + } + + return success; +} + +static void +read_gtm_opts(void) +{ + if (gtm_opts == NULL) + { + gtm_opts = ""; /* default */ + if (ctl_command == RESTART_COMMAND) + { + char **optlines; + + optlines = readfile(gtmopts_file); + if (optlines == NULL) + { + write_stderr(_("%s: could not read file \"%s\"\n"), progname, gtmopts_file); + exit(1); + } + else if (optlines[0] == NULL || optlines[1] != NULL) + { + write_stderr(_("%s: option file \"%s\" must have exactly one line\n"), + progname, gtmopts_file); + exit(1); + } + else + { + int len; + char *optline; + + optline = optlines[0]; + /* trim off line endings */ + len = strcspn(optline, "\r\n"); + optline[len] = '\0'; + + gtm_opts = optline; + } + } + } +} + +static void +do_start(void) +{ + pgpid_t pid; + pgpid_t old_pid = 0; + int exitcode; + + if (ctl_command != RESTART_COMMAND) + { + old_pid = get_pgpid(); + if (old_pid != 0) + write_stderr(_("%s: another server might be running; " + "trying to start server anyway\n"), + progname); + } + + read_gtm_opts(); + + exitcode = start_gtm(); + if (exitcode != 0) + { + write_stderr(_("%s: could not start server: exit code was %d\n"), + progname, exitcode); + exit(1); + } + + if (old_pid != 0) + { + sleep(1); + pid = get_pgpid(); + if (pid == old_pid) + { + write_stderr(_("%s: could not start server\n" + "Examine the log output.\n"), + progname); + exit(1); + } + } + + if (do_wait) + { + print_msg(_("waiting for server to start...")); + + if (test_gtm_connection() == false) + { + printf(_("could not start server\n")); + exit(1); + } + else + { + print_msg(_(" done\n")); + print_msg(_("server 
started\n")); + } + } + else + print_msg(_("server starting\n")); +} + + +static void +do_stop(void) +{ + int cnt; + pgpid_t pid; + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + write_stderr(_("%s: cannot stop server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + exit(1); + } + + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + if (!do_wait) + { + print_msg(_("server shutting down\n")); + return; + } + else + { + print_msg(_("waiting for server to shut down...")); + + for (cnt = 0; cnt < wait_seconds; cnt++) + { + if ((pid = get_pgpid()) != 0) + { + print_msg("."); + sleep(1); /* 1 sec */ + } + else + break; + } + + if (pid != 0) /* pid file still exists */ + { + print_msg(_(" failed\n")); + + write_stderr(_("%s: server does not shut down\n"), progname); + exit(1); + } + print_msg(_(" done\n")); + + printf(_("server stopped\n")); + } +} + +static void +do_promote(void) +{ + pgpid_t pid; + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + write_stderr(_("%s: cannot promote server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + exit(1); + } + + if (kill((pid_t) pid, SIGUSR1) != 0) + { + write_stderr(_("%s: could not send promote signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } +} + +/* + * At least we expect the following argument + * + * 1) -D datadir + * 2) -o options: we expect that -t and -s options are specified here. 
+ * Check will be done in GTM-Proxy. If there's an error, it will be + * logged. In this case, GTM-Proxy won't terminate. It will continue + * to read/write with old GTM. + * + * Because they are not passed to gtm directly, they should appear in + * gtm_ctl argument, not in -o options. They're specific to gtm_ctl + * reconnect. + */ +static void +do_reconnect(void) +{ + pgpid_t pid; + char *reconnect_point_file_nam; + FILE *reconnect_point_file; + +#ifdef GTM_SBY_DEBUG + write_stderr("Reconnecting to new GTM ... DEBUG MODE."); +#endif + + /* + * Target must be "gtm_proxy" + */ + if (strcmp(gtm_app, "gtm_proxy") != 0) + { + write_stderr(_("%s: only gtm_proxy can accept reconnect command\n"), progname); + exit(1); + } + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + write_stderr(_("%s: cannot promote server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + exit(1); + } + read_gtm_opts(); + /* + * Pass reconnect info to GTM-Proxy. + * + * Option arguments are written to new gtm file under -D directory. 
+ */ + reconnect_point_file_nam = malloc(strlen(gtm_data) + 9); + if (reconnect_point_file_nam == NULL) + { + write_stderr(_("%s: No memory available.\n"), progname); + exit(1); + } + + snprintf(reconnect_point_file_nam, strlen(gtm_data) + 8, "%s/newgtm", gtm_data); + reconnect_point_file = fopen(reconnect_point_file_nam, "w"); + + if (reconnect_point_file == NULL) + { + write_stderr(_("%s: Cannot open reconnect point file %s\n"), progname, reconnect_point_file_nam); + exit(1); + } + + fprintf(reconnect_point_file, "%s\n", gtm_opts); + fclose(reconnect_point_file); + free(reconnect_point_file_nam); + + if (kill((pid_t) pid, SIGUSR1) != 0) + { + write_stderr(_("%s: could not send promote signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } +} + + +/* + * restart/reload routines + */ + +static void +do_restart(void) +{ + int cnt; + pgpid_t pid; + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), + progname, pid_file); + write_stderr(_("Is server running?\n")); + write_stderr(_("starting server anyway\n")); + do_start(); + return; + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + if (gtm_is_alive((pid_t) pid)) + { + write_stderr(_("%s: cannot restart server; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + write_stderr(_("Please terminate the single-user server and try again.\n")); + exit(1); + } + } + + if (gtm_is_alive((pid_t) pid)) + { + if (kill((pid_t) pid, sig) != 0) + { + write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid, + strerror(errno)); + exit(1); + } + + print_msg(_("waiting for server to shut down...")); + + /* always wait for restart */ + + for (cnt = 0; cnt < wait_seconds; cnt++) + { + if ((pid = get_pgpid()) != 0) + { + print_msg("."); + sleep(1); /* 1 sec */ + } + else + break; + } + + if (pid != 0) /* pid file still exists */ + { + print_msg(_(" failed\n")); + + 
write_stderr(_("%s: server does not shut down\n"), progname); + exit(1); + } + + print_msg(_(" done\n")); + printf(_("server stopped\n")); + } + else + { + write_stderr(_("%s: old server process (PID: %ld) seems to be gone\n"), + progname, pid); + write_stderr(_("starting server anyway\n")); + } + + do_start(); +} + + +static void +do_status(void) +{ + pgpid_t pid; + char datpath[MAXPGPATH]; + int mode; + FILE *pidf; + + /* + * Read a PID file to get GTM server status instead of attaching shared memory. + */ + pidf = fopen(pid_file, "r"); + if (pidf == NULL) + { + write_stderr(_("%s: could not open PID file \"%s\": %s\n"), + progname, pid_file, strerror(errno)); + exit(1); + } + + if (fscanf(pidf, "%ld", &pid) != 1) + { + write_stderr(_("%s: invalid data in PID file \"%s\"\n"), + progname, pid_file); + exit(1); + } + + if (fscanf(pidf, "%s", datpath) != 1) + { + write_stderr(_("%s: invalid data in PID file \"%s\"\n"), + progname, pid_file); + exit(1); + } + + if (fscanf(pidf, "%d", &mode) != 1) + { + write_stderr(_("%s: invalid data in PID file \"%s\"\n"), + progname, pid_file); + exit(1); + } + + fclose(pidf); + + pid = get_pgpid(); + + if (pid == 0) /* no pid file */ + { + write_stderr(_("%s: PID file \"%s\" does not exist\n"), + progname, pid_file); + write_stderr(_("Is server running?\n")); + exit(1); + } + else if (pid < 0) /* standalone backend, not gtm */ + { + pid = -pid; + if (gtm_is_alive((pid_t) pid)) + { + write_stderr(_("%s: cannot get server status; " + "single-user server is running (PID: %ld)\n"), + progname, pid); + write_stderr(_("Please terminate the single-user server and try again.\n")); + exit(1); + } + } + else + { + if (gtm_is_alive((pid_t) pid)) + { + char **optlines; + + printf(_("%s: server is running (PID: %ld)\n"), + progname, pid); + + optlines = readfile(gtmopts_file); + if (optlines != NULL) + for (; *optlines != NULL; optlines++) + fputs(*optlines, stdout); + return; + } + } + + write_stderr(_("%s: no server running\n"), progname); 
+ exit(1); +} + + +/* + * utility routines + */ + +static bool +gtm_is_alive(pid_t pid) +{ + /* + * Test to see if the process is still there. Note that we do not + * consider an EPERM failure to mean that the process is still there; + * EPERM must mean that the given PID belongs to some other userid, and + * considering the permissions on $GTMDATA, that means it's not the + * gtm we are after. + * + * Don't believe that our own PID or parent shell's PID is the gtm, + * either. (Windows hasn't got getppid(), though.) + */ + if (pid == getpid()) + return false; +#ifndef WIN32 + if (pid == getppid()) + return false; +#endif + if (kill(pid, 0) == 0) + return true; + return false; +} + +static void +do_advice(void) +{ + write_stderr(_("Try \"%s --help\" for more information.\n"), progname); +} + + +static void +do_help(void) +{ + printf(_("%s is a utility to start, stop or restart,\n" + "a GTM server, a GTM standby or GTM proxy.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s start -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname); + printf(_(" %s stop -Z STARTUP_MODE [-W] [-t SECS] [-D DATADIR] [-m SHUTDOWN-MODE]\n"), progname); + printf(_(" %s promote -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR]\n"), progname); + printf(_(" %s restart -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-m SHUTDOWN-MODE]\n" + " [-o \"OPTIONS\"]\n"), progname); + printf(_(" %s status -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR]\n"), progname); + printf(_(" %s reconnect -Z STARTUP_MODE [-D DATADIR] -o \"OPTIONS\"]\n"), progname); + + printf(_("\nCommon options:\n")); + printf(_(" -D DATADIR location of the database storage area\n")); + printf(_(" -i nodename set gtm_proxy nodename registered on GTM\n")); + printf(_(" (option ignored if used with GTM)\n")); + printf(_(" -t SECS seconds to wait when using -w option\n")); + printf(_(" -w wait until operation completes\n")); + printf(_(" -W do not wait until operation completes\n")); + printf(_(" 
--help show this help, then exit\n")); + printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n")); + + printf(_("\nOptions for start or restart:\n")); + printf(_(" -l FILENAME write (or append) server log to FILENAME\n")); + printf(_(" -o OPTIONS command line options to pass to gtm\n" + " (GTM server executable)\n")); + printf(_(" -p PATH-TO-GTM/PROXY path to gtm/gtm_proxy executables\n")); + printf(_(" -Z STARTUP-MODE can be \"gtm\", \"gtm_standby\" or \"gtm_proxy\"\n")); + printf(_("\nOptions for stop or restart:\n")); + printf(_(" -m SHUTDOWN-MODE can be \"smart\", \"fast\", or \"immediate\"\n")); + + printf(_("\nOptions for reconnect:\n")); + printf(_(" -t NewGTMPORT Port number of new GTM.\n")); + printf(_(" -s NewGTMHost Host Name of new GTM.\n")); + + printf(_("\nShutdown modes are:\n")); + printf(_(" smart quit after all clients have disconnected\n")); + printf(_(" fast quit directly, with proper shutdown\n")); + printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n")); +} + + +static void +set_mode(char *modeopt) +{ + if (strcmp(modeopt, "s") == 0 || strcmp(modeopt, "smart") == 0) + { + shutdown_mode = SMART_MODE; + sig = SIGTERM; + } + else if (strcmp(modeopt, "f") == 0 || strcmp(modeopt, "fast") == 0) + { + shutdown_mode = FAST_MODE; + sig = SIGINT; + } + else if (strcmp(modeopt, "i") == 0 || strcmp(modeopt, "immediate") == 0) + { + shutdown_mode = IMMEDIATE_MODE; + sig = SIGQUIT; + } + else + { + write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"), progname, modeopt); + do_advice(); + exit(1); + } +} + +int +main(int argc, char **argv) +{ + int c; + char *nodename = NULL; /* GTM Proxy nodename */ + + progname = "gtm_ctl"; + + /* + * save argv[0] so do_start() can look for the gtm if necessary. we + * don't look for gtm here because in many cases we won't need it. 
+ */ + argv0 = argv[0]; + + umask(077); + + /* support --help and --version even if invoked as root */ + if (argc > 1) + { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 || + strcmp(argv[1], "-?") == 0) + { + do_help(); + exit(0); + } + } + + /* + * Disallow running as root, to forestall any possible security holes. + */ + if (geteuid() == 0) + { + write_stderr(_("%s: cannot be run as root\n" + "Please log in (using, e.g., \"su\") as the " + "(unprivileged) user that will\n" + "own the server process.\n"), + progname); + exit(1); + } + + /* + * 'Action' can be before or after args so loop over both. Some + * getopt_long() implementations will reorder argv[] to place all flags + * first (GNU?), but we don't rely on it. Our /port version doesn't do + * that. + */ + optind = 1; + + /* process command-line options */ + while (optind < argc) + { + while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:")) != -1) + { + switch (c) + { + case 'D': + { + char *gtmdata_D; + char *env_var = pg_malloc(strlen(optarg) + 9); + + gtmdata_D = xstrdup(optarg); + canonicalize_path(gtmdata_D); + snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s", + gtmdata_D); + putenv(env_var); + + /* + * We could pass GTMDATA just in an environment + * variable but we do -D too for clearer gtm + * 'ps' display + */ + gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8); + snprintf(gtmdata_opt, strlen(gtmdata_D) + 8, + "-D \"%s\" ", + gtmdata_D); + break; + } + case 'i': + nodename = strdup(optarg); + break; + case 'l': + log_file = xstrdup(optarg); + break; + case 'm': + set_mode(optarg); + break; + case 'o': + gtm_opts = xstrdup(optarg); + break; + case 'p': + gtm_path = xstrdup(optarg); + canonicalize_path(gtm_path); + break; + case 't': + wait_seconds = atoi(optarg); + break; + case 'w': + do_wait = true; + wait_set = true; + break; + case 'W': + do_wait = false; + wait_set = true; + break; + case 'Z': + gtm_app = xstrdup(optarg); + if (strcmp(gtm_app,"gtm_proxy") != 0 + && 
strcmp(gtm_app,"gtm_standby") != 0 + && strcmp(gtm_app,"gtm") != 0) + { + write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app); + do_advice(); + exit(1); + } + break; + default: + /* getopt_long already issued a suitable error message */ + do_advice(); + exit(1); + } + } + + /* Process an action */ + if (optind < argc) + { + if (ctl_command != NO_COMMAND) + { + write_stderr(_("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind]); + do_advice(); + exit(1); + } + + if (strcmp(argv[optind], "start") == 0) + ctl_command = START_COMMAND; + else if (strcmp(argv[optind], "stop") == 0) + ctl_command = STOP_COMMAND; + else if (strcmp(argv[optind], "promote") == 0) + ctl_command = PROMOTE_COMMAND; + else if (strcmp(argv[optind], "restart") == 0) + ctl_command = RESTART_COMMAND; + else if (strcmp(argv[optind], "status") == 0) + ctl_command = STATUS_COMMAND; + else if (strcmp(argv[optind], "reconnect") == 0) + ctl_command = RECONNECT_COMMAND; + else + { + write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), + progname, argv[optind]); + do_advice(); + exit(1); + } + optind++; + } + } + + if (ctl_command == NO_COMMAND) + { + write_stderr(_("%s: no operation specified\n"), progname); + do_advice(); + exit(1); + } + + gtm_data = getenv("GTMDATA"); + + if (gtm_data) + { + gtm_data = xstrdup(gtm_data); + canonicalize_path(gtm_data); + } + + if (!gtm_data) + { + write_stderr("%s: no GTM/GTM Proxy directory specified \n", + progname); + do_advice(); + exit(1); + } + + /* + * pid files of gtm and gtm proxy are named differently + * -Z option has also to be set for STOP_COMMAND + * or gtm_ctl will not be able to find the correct pid_file + */ + if (!gtm_app) + { + write_stderr("%s: no launch option not specified\n", + progname); + do_advice(); + exit(1); + } + + if (strcmp(gtm_app,"gtm_proxy") != 0 && + strcmp(gtm_app, "gtm_standby") != 0 && + strcmp(gtm_app,"gtm") != 0) + { + write_stderr(_("%s: launch option incorrect\n"), + 
progname); + do_advice(); + exit(1); + } + + /* Check if GTM Proxy ID is set, this is not necessary when stopping */ + if (ctl_command == START_COMMAND || + ctl_command == RESTART_COMMAND) + { + /* Rebuild option string to include Proxy ID */ + if (strcmp(gtm_app, "gtm_proxy") == 0) + { + gtmdata_opt = (char *) pg_realloc(gtmdata_opt, strlen(gtmdata_opt) + 9); + if (nodename) + sprintf(gtmdata_opt, "%s -i %s ", gtmdata_opt, nodename); + else + sprintf(gtmdata_opt, "%s ", gtmdata_opt); + } + } + + if (!wait_set) + { + switch (ctl_command) + { + case RESTART_COMMAND: + case START_COMMAND: + case PROMOTE_COMMAND: + case STATUS_COMMAND: + do_wait = false; + break; + case STOP_COMMAND: + do_wait = true; + break; + default: + break; + } + } + + /* Build strings for pid file and option file */ + if (strcmp(gtm_app,"gtm_proxy") == 0) + { + snprintf(pid_file, MAXPGPATH, "%s/gtm_proxy.pid", gtm_data); + snprintf(gtmopts_file, MAXPGPATH, "%s/gtm_proxy.opts", gtm_data); + snprintf(conf_file, MAXPGPATH, "%s/gtm_proxy.conf", gtm_data); + } + else if (strcmp(gtm_app,"gtm") == 0) + { + snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data); + snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data); + snprintf(conf_file, MAXPGPATH, "%s/gtm.conf", gtm_data); + } + else if (strcmp(gtm_app,"gtm_standby") == 0) + { + snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data); + snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data); + snprintf(conf_file, MAXPGPATH, "%s/gtm.conf", gtm_data); + } + + if (ctl_command==STATUS_COMMAND) + gtm_opts = xstrdup("-c"); + + switch (ctl_command) + { + case START_COMMAND: + do_start(); + break; + case STOP_COMMAND: + do_stop(); + break; + case PROMOTE_COMMAND: + do_promote(); + break; + case RESTART_COMMAND: + do_restart(); + break; + case STATUS_COMMAND: + do_status(); + break; + case RECONNECT_COMMAND: + do_reconnect(); + break; + default: + break; + } + + exit(0); +} + +/* + * Safer versions of standard realloc C library function. 
If an + * out-of-memory condition occurs, these functions will bail out + * safely; therefore, its return value is guaranteed to be non-NULL. + */ +static void * +pg_realloc(void *ptr, size_t size) +{ + void *tmp; + + tmp = realloc(ptr, size); + if (!tmp) + write_stderr("out of memory\n"); + return tmp; +} diff --git a/src/gtm/libpq/Makefile b/src/gtm/libpq/Makefile index 4cbd004628..dd22b0dcb1 100644 --- a/src/gtm/libpq/Makefile +++ b/src/gtm/libpq/Makefile @@ -11,17 +11,19 @@ top_builddir=../../.. include $(top_builddir)/src/Makefile.global subdir=src/gtm/libpq -include $(top_srcdir)/src/backend/common.mk +NAME=pqcomm +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 -OBJS = ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o +OBJS=ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o -all: libgtmpq.a +all:all-lib -libgtmpq.a: $(OBJS) - $(AR) $(AROPT) $@ $^ +include $(top_srcdir)/src/Makefile.shlib clean: - rm -f $(OBJS) libgtmpq.a + rm -f $(OBJS) + rm -f libpqcomm.so libpqcomm.so.1 libpqcomm.so.1.0 distclean: clean diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c index 292b3c035b..d9a19a4707 100644 --- a/src/gtm/libpq/pqcomm.c +++ b/src/gtm/libpq/pqcomm.c @@ -297,16 +297,14 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, continue; } -#define GTM_MAX_CONNECTIONS 1024 +#define GTM_MAX_CONNECTIONS 4096 /* * Select appropriate accept-queue length limit. PG_SOMAXCONN is only * intended to provide a clamp on the request on platforms where an * overly large request provokes a kernel error (are there any?). 
*/ - maxconn = GTM_MAX_CONNECTIONS * 2; - - err = listen(fd, maxconn); + err = listen(fd, GTM_MAX_CONNECTIONS); if (err < 0) { ereport(LOG, diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile index f85d977eac..d207e32dcc 100644 --- a/src/gtm/main/Makefile +++ b/src/gtm/main/Makefile @@ -15,38 +15,23 @@ ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif -SUBDIRS = $(top_builddir)/src/gtm/client \ - $(top_builddir)/src/gtm/common \ - $(top_builddir)/src/gtm/config \ - $(top_builddir)/src/gtm/libpq \ - $(top_builddir)/src/gtm/path \ - $(top_builddir)/src/gtm/recovery +OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_time.o gtm_standby.o gtm_opt.o -include $(top_srcdir)/src/backend/common.mk +OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a ../../port/libpgport.a -OBJS = $(SUBDIROBJS) \ - $(top_builddir)/src/port/libpgport_srv.a \ - main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_time.o \ - gtm_standby.o gtm_opt.o register_gtm.o replication.o +LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq -LIBS += $(PTHREAD_LIBS) +LIBS=-lpthread -all: gtm +gtm:$(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) -o gtm -gtm: $(OBJS) | submake-libpgport - $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $(call expand_subsys,$^) -o $@ +all:gtm -install: all installdirs - $(INSTALL_PROGRAM) gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)' - $(INSTALL_DATA) $(srcdir)/gtm.conf.sample '$(DESTDIR)$(datadir)/gtm.conf.sample' +clean: + rm -f $(OBJS) + rm -f gtm -installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(datadir)' +distclean: clean -uninstall: - rm -f '$(DESTDIR)$(bindir)/gtm$(X)' '$(DESTDIR)$(datadir)/gtm.conf.sample' - -clean distclean maintainer-clean: - rm -f gtm$(X) $(OBJS) - -$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport +maintainer-clean: distclean diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c index 4a0b20442f..f05be304cc 100644 --- 
a/src/gtm/main/gtm_opt.c +++ b/src/gtm/main/gtm_opt.c @@ -148,7 +148,7 @@ struct config_int ConfigureNamesInt[] = 0 }, >MPortNumber, - 6666, 0, INT_MAX, + 0, 0, INT_MAX, 0, NULL }, { diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index b8d999b200..a9f2dd1b0a 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -3,6 +3,11 @@ * gtm_seq.c * Sequence handling on GTM * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -47,6 +52,9 @@ static int seq_add_seqinfo(GTM_SeqInfo *seqinfo); static int seq_remove_seqinfo(GTM_SeqInfo *seqinfo); static GTM_SequenceKey seq_copy_key(GTM_SequenceKey key); static int seq_drop_with_dbkey(GTM_SequenceKey nsp); +#ifdef XCP +static GTM_Sequence get_rangemax(GTM_SeqInfo *seqinfo, GTM_Sequence range); +#endif /* * Get the hash value given the sequence key @@ -331,8 +339,14 @@ GTM_SeqOpen(GTM_SequenceKey seqkey, */ seqinfo->gs_cycle = cycle; +#ifdef XCP + seqinfo->gs_max_lastvals = 0; + seqinfo->gs_lastval_count = 0; + seqinfo->gs_last_values = NULL; +#else /* Set the last value in case of a future restart */ seqinfo->gs_last_value = seqinfo->gs_init_value; +#endif if ((errcode = seq_add_seqinfo(seqinfo))) { @@ -386,14 +400,23 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, { /* Restart command has been used, reset the sequence */ seqinfo->gs_called = false; +#ifdef XCP + seqinfo->gs_value = lastval; +#else seqinfo->gs_init_value = seqinfo->gs_last_value = lastval; +#endif } +#ifdef XCP + if (seqinfo->gs_init_value != startval) + seqinfo->gs_init_value = startval; +#else else { 
/* Start has been used, reinitialize init value */ if (seqinfo->gs_init_value != startval) seqinfo->gs_init_value = seqinfo->gs_last_value = startval; } +#endif /* Remove the old key with the old name */ GTM_RWLockRelease(&seqinfo->gs_lock); @@ -433,7 +456,14 @@ GTM_SeqRestore(GTM_SequenceKey seqkey, seqinfo->gs_min_value = minval; seqinfo->gs_max_value = maxval; +#ifdef XCP + seqinfo->gs_init_value = startval; + seqinfo->gs_max_lastvals = 0; + seqinfo->gs_lastval_count = 0; + seqinfo->gs_last_values = NULL; +#else seqinfo->gs_init_value = seqinfo->gs_last_value = startval; +#endif seqinfo->gs_value = curval; /* @@ -561,7 +591,7 @@ seq_drop_with_dbkey(GTM_SequenceKey nsp) /* Sequence is not is busy state, it can be deleted safely */ bucket->shb_list = gtm_list_delete_cell(bucket->shb_list, cell, prev); - elog(LOG, "Sequence %s was deleted from GTM", + elog(DEBUG1, "Sequence %s was deleted from GTM", curr_seqinfo->gs_key->gsk_key); deleted = true; @@ -625,7 +655,17 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey) newseqinfo->gs_cycle = seqinfo->gs_cycle; newseqinfo->gs_state = seqinfo->gs_state; +#ifdef XCP + newseqinfo->gs_max_lastvals = seqinfo->gs_max_lastvals; + newseqinfo->gs_lastval_count = seqinfo->gs_lastval_count; + newseqinfo->gs_last_values = (GTM_SeqLastVal *) + MemoryContextAlloc(TopMostMemoryContext, + newseqinfo->gs_max_lastvals * sizeof(GTM_SeqLastVal)); + memcpy(newseqinfo->gs_last_values, seqinfo->gs_last_values, + newseqinfo->gs_max_lastvals * sizeof(GTM_SeqLastVal)); +#else newseqinfo->gs_last_value = seqinfo->gs_last_value; +#endif /* Add the copy to the list */ if ((errcode = seq_add_seqinfo(newseqinfo))) /* a lock is taken here for the new sequence */ @@ -648,6 +688,333 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey) return errcode; } +#ifdef XCP +/* + * Get current value for the sequence without incrementing it + */ +void +GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, 
GTM_Sequence *result) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + int i; + bool found = false; + + elog(DEBUG1, "Look up last value of Sequence %s in session %s:%d", + seqkey->gsk_key, coord_name, coord_procid); + + if (seqinfo == NULL) + { + ereport(ERROR, + (EINVAL, + errmsg("sequence \"%s\" does not exist", seqkey->gsk_key))); + return; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ); + + for (i = 0; i < seqinfo->gs_lastval_count; i++) + { + if (strcmp(seqinfo->gs_last_values[i].gs_coord_name, coord_name) == 0 && + seqinfo->gs_last_values[i].gs_coord_procid == coord_procid) + { + *result = seqinfo->gs_last_values[i].gs_last_value; + found = true; + break; + } + } + + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + if (!found) + ereport(ERROR, + (ERANGE, + errmsg("currval of sequence \"%s\" is not yet defined in this session", + seqkey->gsk_key))); + +} + + +/* + * Store the sequence value as last for the specified distributed session + */ +static void +seq_set_lastval(GTM_SeqInfo *seqinfo, char *coord_name, + int coord_procid, GTM_Sequence newval) +{ + GTM_SeqLastVal *lastval; + int i; + + /* Can not assign value to not defined value */ + if (coord_name == NULL || coord_procid == 0) + return; + + elog(DEBUG1, "Remember last value of Sequence %s in session %s:%d", + seqinfo->gs_key->gsk_key, coord_name, coord_procid); + + /* + * If last value is already defined for the session update it + */ + for (i = 0; i < seqinfo->gs_lastval_count; i++) + { + if (strcmp(seqinfo->gs_last_values[i].gs_coord_name, coord_name) == 0 && + seqinfo->gs_last_values[i].gs_coord_procid == coord_procid) + { + seqinfo->gs_last_values[i].gs_last_value = newval; + return; + } + } + + /* Not found, add new entry for the distributed session */ + if (seqinfo->gs_lastval_count == seqinfo->gs_max_lastvals) + { + /* Need more room */ +#define INIT_LASTVALS 16 + + if (seqinfo->gs_max_lastvals == 0) + { + /* No values at all, palloc memory block 
*/ + MemoryContext oldContext; + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + seqinfo->gs_last_values = (GTM_SeqLastVal *) + palloc(INIT_LASTVALS * sizeof(GTM_SeqLastVal)); + seqinfo->gs_max_lastvals = INIT_LASTVALS; + MemoryContextSwitchTo(oldContext); + } + else + { + /* Increase existing array */ + int newsize = seqinfo->gs_max_lastvals * 2; + seqinfo->gs_last_values = (GTM_SeqLastVal *) + repalloc(seqinfo->gs_last_values, + newsize * sizeof(GTM_SeqLastVal)); + seqinfo->gs_max_lastvals = newsize; + } + } + + /* Populate new entry */ + lastval = &seqinfo->gs_last_values[seqinfo->gs_lastval_count++]; + memcpy(lastval->gs_coord_name, coord_name, strlen(coord_name) + 1); + lastval->gs_coord_procid = coord_procid; + lastval->gs_last_value = newval; +} + + +/* + * Set values for the sequence + */ +int +GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + + return EINVAL; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + + seqinfo->gs_value = nextval; + seqinfo->gs_called = iscalled; + + /* If sequence is not called, update last value for the session */ + if (!iscalled) + seq_set_lastval(seqinfo, coord_name, coord_procid, nextval); + + /* Remove the old key with the old name */ + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + + return 0; +} + +/* + * Get next value for the sequence + */ +int +GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + return EINVAL; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); 
+ + /* + * If the sequence is called for the first time return the current value. + * It should be already initialized. + */ + if (!SEQ_IS_CALLED(seqinfo)) + { + *result = seqinfo->gs_value; + seqinfo->gs_called = true; + } + else + { + if (SEQ_IS_ASCENDING(seqinfo)) + { + /* + * Check if the sequence is about to wrap-around. If the sequence + * does not support wrap-around, throw an error. + * Beware overflow! + */ + if (seqinfo->gs_max_value - seqinfo->gs_increment_by + >= seqinfo->gs_value) + { + int newval = seqinfo->gs_value + seqinfo->gs_increment_by; + *result = seqinfo->gs_value = newval; + } + else if (SEQ_IS_CYCLE(seqinfo)) + *result = seqinfo->gs_value = seqinfo->gs_min_value; + else + { + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + ereport(LOG, + (ERANGE, + errmsg("Sequence reached maximum value"))); + return ERANGE; + } + } + else + { + /* + * Check if the sequence is about to wrap-around. If the sequence + * does not support wrap-around, throw an error. + * Beware overflow! + * + * Note: The gs_increment_by is a signed integer and is negative for + * descending sequences. So we don't need special handling below + */ + if (seqinfo->gs_min_value - seqinfo->gs_increment_by + <= seqinfo->gs_value) + { + int newval = seqinfo->gs_value + seqinfo->gs_increment_by; + *result = seqinfo->gs_value = newval; + } + else if (SEQ_IS_CYCLE(seqinfo)) + *result = seqinfo->gs_value = seqinfo->gs_max_value; + else + { + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + ereport(LOG, + (ERANGE, + errmsg("Sequence reached maximum value"))); + return ERANGE; + } + } + } + /* if range is specified calculate valid max value for this range */ + if (range > 1) + *rangemax = get_rangemax(seqinfo, range); + else + *rangemax = *result; + /* + * lastval has to be set to rangemax obtained above because + * values upto it will be consumed by this nextval caller and + * the next caller should get values starting above this + * lastval. 
Same reasoning for gs_value, but we still return + * result as the first calculated gs_value above to form the + * local starting seed at the caller. This will go upto the + * rangemax value before contacting GTM again.. + */ + seq_set_lastval(seqinfo, coord_name, coord_procid, *rangemax); + seqinfo->gs_value = *rangemax; + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + return 0; +} + +/* + * Given a sequence and the requested range for its values, calculate + * the legitimate maximum permissible value for this range. In + * particular we need to be careful about overflow and underflow for + * mix and max types of sequences.. + */ +static GTM_Sequence +get_rangemax(GTM_SeqInfo *seqinfo, GTM_Sequence range) +{ + GTM_Sequence rangemax = seqinfo->gs_value; + + /* + * Deduct 1 from range because the currval has been accounted + * for already before this call has been made + */ + range--; + if (SEQ_IS_ASCENDING(seqinfo)) + { + /* + * Check if the sequence will overflow because of the range + * request. If yes, cap it at close to or equal to max value + */ + while (range != 0 && + (seqinfo->gs_max_value - seqinfo->gs_increment_by >= + rangemax)) + { + rangemax += seqinfo->gs_increment_by; + range--; + } + } + else + { + /* + * Check if the sequence will underflow because of the range + * request. If yes, cap it at close to or equal to min value + * + * Note: The gs_increment_by is a signed integer and is negative for + * descending sequences. 
So we don't need special handling below + */ + while (range != 0 && + (seqinfo->gs_min_value - seqinfo->gs_increment_by <= + rangemax)) + { + rangemax += seqinfo->gs_increment_by; + range--; + } + } + return rangemax; +} +#else +/* + * Get current value for the sequence without incrementing it + */ +GTM_Sequence +GTM_SeqGetCurrent(GTM_SequenceKey seqkey) +{ + GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + GTM_Sequence value; + + if (seqinfo == NULL) + { + ereport(LOG, + (EINVAL, + errmsg("The sequence with the given key does not exist"))); + return InvalidSequenceValue; + } + + GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + + value = seqinfo->gs_last_value; + + GTM_RWLockRelease(&seqinfo->gs_lock); + seq_release_seqinfo(seqinfo); + return value; +} /* * Set values for the sequence @@ -769,6 +1136,7 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey) seq_release_seqinfo(seqinfo); return value; } +#endif /* * Reset the sequence @@ -787,7 +1155,11 @@ GTM_SeqReset(GTM_SequenceKey seqkey) } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); +#ifdef XCP + seqinfo->gs_value = seqinfo->gs_init_value; +#else seqinfo->gs_value = seqinfo->gs_last_value = seqinfo->gs_init_value; +#endif GTM_RWLockRelease(&seqinfo->gs_lock); seq_release_seqinfo(seqinfo); @@ -856,7 +1228,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) MemoryContextSwitchTo(oldContext); - elog(LOG, "Opening sequence %s", seqkey.gsk_key); + elog(DEBUG1, "Opening sequence %s", seqkey.gsk_key); pq_getmsgend(message); @@ -869,7 +1241,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling open_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling open_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: rc = bkup_open_sequence(GetMyThreadInfo->thr_conn->standby, @@ -887,8 +1259,12 @@ 
ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "open_sequence() returns rc %d.", rc); + elog(DEBUG1, "open_sequence() returns rc %d.", rc); } +#ifdef XCP + /* Save control file with new seq info */ + SaveControlInfo(); +#endif /* * Send a SUCCESS message back to the client */ @@ -961,7 +1337,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup) */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); - elog(LOG, "Altering sequence key %s", seqkey.gsk_key); + elog(DEBUG1, "Altering sequence key %s", seqkey.gsk_key); if ((errcode = GTM_SeqAlter(&seqkey, increment, minval, maxval, startval, lastval, cycle, is_restart))) ereport(ERROR, @@ -981,7 +1357,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling alter_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling alter_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: rc = bkup_alter_sequence(GetMyThreadInfo->thr_conn->standby, @@ -1001,8 +1377,12 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "alter_sequence() returns rc %d.", rc); + elog(DEBUG1, "alter_sequence() returns rc %d.", rc); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_ALTER_RESULT, 4); if (myport->remote_type == GTM_NODE_GTM_PROXY) @@ -1034,9 +1414,9 @@ void ProcessSequenceListCommand(Port *myport, StringInfo message) { StringInfoData buf; - int seq_count = 0; - MemoryContext oldContext; - GTM_SeqInfo *seq_list[1024]; /* 
FIXME: make it expandable. */ + int seq_count; + int seq_maxcount; + GTM_SeqInfo **seq_list; int i; if (Recovery_IsStandby()) @@ -1044,39 +1424,57 @@ ProcessSequenceListCommand(Port *myport, StringInfo message) (EPERM, errmsg("Operation not permitted under the standby mode."))); - memset(seq_list, 0, sizeof(GTM_SeqInfo *) * 1024); - - /* - * We must use the TopMostMemoryContext because the sequence information is - * not bound to a thread and can outlive any of the thread specific - * contextes. - */ - oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + seq_count = 0; + seq_maxcount = 1024; + seq_list = (GTM_SeqInfo **) palloc(seq_maxcount * sizeof(GTM_SeqInfo *));; /* * Store pointers to all GTM_SeqInfo in the hash buckets into an array. */ + for (i = 0 ; i < SEQ_HASH_TABLE_SIZE ; i++) { GTM_SeqInfoHashBucket *b; gtm_ListCell *elem; - for (i = 0 ; i < SEQ_HASH_TABLE_SIZE ; i++) - { - b = >MSequences[i]; + b = >MSequences[i]; - GTM_RWLockAcquire(&b->shb_lock, GTM_LOCKMODE_READ); + GTM_RWLockAcquire(&b->shb_lock, GTM_LOCKMODE_READ); - gtm_foreach(elem, b->shb_list) + gtm_foreach(elem, b->shb_list) + { + /* Allocate larger array if required */ + if (seq_count == seq_maxcount) { - seq_list[seq_count] = (GTM_SeqInfo *) gtm_lfirst(elem); - seq_count++; + int newcount; + GTM_SeqInfo **newlist; + + newcount = 2 * seq_maxcount; + newlist = (GTM_SeqInfo **) repalloc(seq_list, newcount * sizeof(GTM_SeqInfo *)); + /* + * If failed try to get less. It is unlikely to happen, but + * let's be safe. 
+ */ + while (newlist == NULL) + { + newcount = seq_maxcount + (newcount - seq_maxcount) / 2 - 1; + if (newcount <= seq_maxcount) + { + /* give up */ + ereport(ERROR, + (ERANGE, + errmsg("Can not list all the sequences"))); + } + newlist = (GTM_SeqInfo **) repalloc(seq_list, newcount * sizeof(GTM_SeqInfo *)); + } + seq_maxcount = newcount; + seq_list = newlist; } - - GTM_RWLockRelease(&b->shb_lock); + seq_list[seq_count] = (GTM_SeqInfo *) gtm_lfirst(elem); + seq_count++; } - } - MemoryContextSwitchTo(oldContext); + GTM_RWLockRelease(&b->shb_lock); + } pq_getmsgend(message); @@ -1093,27 +1491,38 @@ ProcessSequenceListCommand(Port *myport, StringInfo message) /* Send a number of sequences */ pq_sendint(&buf, seq_count, 4); - for (i = 0 ; i < seq_count ; i++) + /* + * Send sequences from the array + */ { - char *seq_buf; - size_t seq_buflen; - - seq_buflen = gtm_get_sequence_size(seq_list[i]); - seq_buf = (char *)malloc(seq_buflen); + /* + * TODO set initial size big enough to fit any sequence, and avoid + * reallocations. 
+ */ + size_t seq_maxlen = 256; + char *seq_buf = (char *) palloc(seq_maxlen); - gtm_serialize_sequence(seq_list[i], seq_buf, seq_buflen); + for (i = 0 ; i < seq_count ; i++) + { + size_t seq_buflen = gtm_get_sequence_size(seq_list[i]); + if (seq_buflen > seq_maxlen) + { + seq_maxlen = seq_buflen; + seq_buf = (char *)repalloc(seq_buf, seq_maxlen); + } - elog(LOG, "seq_buflen = %ld", seq_buflen); + gtm_serialize_sequence(seq_list[i], seq_buf, seq_buflen); - pq_sendint(&buf, seq_buflen, 4); - pq_sendbytes(&buf, seq_buf, seq_buflen); + elog(DEBUG1, "seq_buflen = %ld", seq_buflen); - free(seq_buf); + pq_sendint(&buf, seq_buflen, 4); + pq_sendbytes(&buf, seq_buf, seq_buflen); + } } pq_endmessage(myport, &buf); - elog(LOG, "ProcessSequenceListCommand() done."); + elog(DEBUG1, "ProcessSequenceListCommand() done."); if (myport->remote_type != GTM_NODE_GTM_PROXY) /* Don't flush to the backup because this does not change the internal status */ @@ -1122,6 +1531,85 @@ ProcessSequenceListCommand(Port *myport, StringInfo message) /* + * Process MSG_SEQUENCE_GET_CURRENT message + */ +void +ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + GTM_Sequence seqval; +#ifdef XCP + uint32 coord_namelen; + char *coord_name; + uint32 coord_procid; +#endif + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); +#ifdef XCP + coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); + if (coord_namelen > 0) + coord_name = (char *)pq_getmsgbytes(message, coord_namelen); + else + coord_name = NULL; + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); + + GTM_SeqGetCurrent(&seqkey, coord_name, coord_procid, &seqval); +#else + seqval = GTM_SeqGetCurrent(&seqkey); + if (!SEQVAL_IS_VALID(seqval)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); +#endif + + elog(DEBUG1, "Getting current 
value %ld for sequence %s", seqval, seqkey.gsk_key); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_GET_CURRENT_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); + pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence)); + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + /* Don't flush to the standby because this does not change the status */ + pq_flush(myport); + + /* + * I don't think backup is needed here. It does not change internal state. + * 27th Dec., 2011, K.Suzuki + */ +#if 0 + if (GetMyThreadInfo->thr_conn->standby) + { + GTM_Sequence loc_seq; + GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; + int count = 0; + + elog(DEBUG1, "calling get_current() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + +retry: + loc_seq = get_current(GetMyThreadInfo->thr_conn->standby, &seqkey); + + if (gtm_standby_check_communication_error(&count, oldconn)) + goto retry; + + elog(DEBUG1, "get_current() returns GTM_Sequence %ld.", loc_seq); + } +#endif + + /* FIXME: need to check errors */ +} + +/* * Process MSG_SEQUENCE_GET_NEXT/MSG_BKUP_SEQUENCE_GET_NEXT message * * is_backup indicates the message is MSG_BKUP_SEQUENCE_GET_NEXT @@ -1132,17 +1620,40 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) GTM_SequenceKeyData seqkey; StringInfoData buf; GTM_Sequence seqval; +#ifdef XCP + GTM_Sequence range; + GTM_Sequence rangemax; + uint32 coord_namelen; + char *coord_name; + uint32 coord_procid; +#endif seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); +#ifdef XCP + coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); + if 
(coord_namelen > 0) + coord_name = (char *)pq_getmsgbytes(message, coord_namelen); + else + coord_name = NULL; + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); + memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range, + &seqval, &rangemax)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); +#else seqval = GTM_SeqGetNext(&seqkey); if (!SEQVAL_IS_VALID(seqval)) ereport(ERROR, (ERANGE, errmsg("Can not get current value of the sequence"))); +#endif - elog(LOG, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); + elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); if (!is_backup) { @@ -1153,10 +1664,16 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling get_next() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling get_next() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: +#ifdef XCP + bkup_get_next(GetMyThreadInfo->thr_conn->standby, &seqkey, + coord_name, coord_procid, + range, &loc_seq, &rangemax); +#else loc_seq = bkup_get_next(GetMyThreadInfo->thr_conn->standby, &seqkey); +#endif if (gtm_standby_check_communication_error(&count, oldconn)) goto retry; @@ -1165,8 +1682,12 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously &&(myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "get_next() returns GTM_Sequence %ld.", loc_seq); + elog(DEBUG1, "get_next() returns GTM_Sequence %ld.", loc_seq); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif /* Respond to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4); @@ -1179,6 +1700,9 @@ 
ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) pq_sendint(&buf, seqkey.gsk_keylen, 4); pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence)); +#ifdef XCP + pq_sendbytes(&buf, (char *)&rangemax, sizeof (GTM_Sequence)); +#endif pq_endmessage(myport, &buf); if (myport->remote_type != GTM_NODE_GTM_PROXY) @@ -1207,12 +1731,25 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup) StringInfoData buf; bool iscalled; int errcode; +#ifdef XCP + uint32 coord_namelen; + char *coord_name; + uint32 coord_procid; +#endif /* * Get the sequence key */ seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); +#ifdef XCP + coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); + if (coord_namelen > 0) + coord_name = (char *)pq_getmsgbytes(message, coord_namelen); + else + coord_name = NULL; + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); +#endif /* Read parameters to be set */ memcpy(&nextval, pq_getmsgbytes(message, sizeof (GTM_Sequence)), @@ -1227,12 +1764,19 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup) */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); - elog(LOG, "Setting new value %ld for sequence %s", nextval, seqkey.gsk_key); + elog(DEBUG1, "Setting new value %ld for sequence %s", nextval, seqkey.gsk_key); +#ifdef XCP + if ((errcode = GTM_SeqSetVal(&seqkey, coord_name, coord_procid, nextval, iscalled))) + ereport(ERROR, + (errcode, + errmsg("Failed to set values of sequence"))); +#else if ((errcode = GTM_SeqSetVal(&seqkey, nextval, iscalled))) ereport(ERROR, (errcode, errmsg("Failed to set values of sequence"))); +#endif MemoryContextSwitchTo(oldContext); @@ -1247,13 +1791,22 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = 
GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling set_val() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling set_val() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: +#ifdef XCP + rc = bkup_set_val(GetMyThreadInfo->thr_conn->standby, + &seqkey, + coord_name, + coord_procid, + nextval, + iscalled); +#else rc = bkup_set_val(GetMyThreadInfo->thr_conn->standby, &seqkey, nextval, iscalled); +#endif if (gtm_standby_check_communication_error(&count, oldconn)) goto retry; @@ -1262,8 +1815,12 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "set_val() returns rc %d.", rc); + elog(DEBUG1, "set_val() returns rc %d.", rc); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif /* Respond to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_SET_VAL_RESULT, 4); @@ -1304,7 +1861,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup) seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); - elog(LOG, "Resetting sequence %s", seqkey.gsk_key); + elog(DEBUG1, "Resetting sequence %s", seqkey.gsk_key); if ((errcode = GTM_SeqReset(&seqkey))) ereport(ERROR, @@ -1320,7 +1877,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling reset_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling reset_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: rc = bkup_reset_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey); @@ -1332,8 +1889,12 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup) if 
(Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "reset_sequence() returns rc %d.", rc); + elog(DEBUG1, "reset_sequence() returns rc %d.", rc); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif /* Respond to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_RESET_RESULT, 4); @@ -1376,7 +1937,7 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup) memcpy(&seqkey.gsk_type, pq_getmsgbytes(message, sizeof (GTM_SequenceKeyType)), sizeof (GTM_SequenceKeyType)); - elog(LOG, "Closing sequence %s", seqkey.gsk_key); + elog(DEBUG1, "Closing sequence %s", seqkey.gsk_key); if ((errcode = GTM_SeqClose(&seqkey))) ereport(ERROR, @@ -1392,7 +1953,7 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling close_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling close_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: rc = bkup_close_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey); @@ -1404,8 +1965,12 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "close_sequence() returns rc %d.", rc); + elog(DEBUG1, "close_sequence() returns rc %d.", rc); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif /* Respond to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_CLOSE_RESULT, 4); @@ -1459,7 +2024,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); - elog(LOG, "Renaming sequence %s to %s", seqkey.gsk_key, newseqkey.gsk_key); + elog(DEBUG1, "Renaming 
sequence %s to %s", seqkey.gsk_key, newseqkey.gsk_key); if ((errcode = GTM_SeqRename(&seqkey, &newseqkey))) ereport(ERROR, @@ -1479,7 +2044,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling rename_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling rename_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: rc = bkup_rename_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey, &newseqkey); @@ -1491,8 +2056,12 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "rename_sequence() returns rc %d.", rc); + elog(DEBUG1, "rename_sequence() returns rc %d.", rc); } +#ifdef XCP + /* Save control file info */ + SaveControlInfo(); +#endif /* Send a SUCCESS message back to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_RENAME_RESULT, 4); @@ -1752,3 +2321,67 @@ GTM_RestoreSeqInfo(FILE *ctlf) state, cycle, called); } } + + +#ifdef XCP +/* + * Remove all current values allocated for the specified session from all + * sequences. 
+ */ +void +GTM_CleanupSeqSession(char *coord_name, int coord_procid) +{ + int i; + + elog(DEBUG1, "Clean up Sequences used in session %s:%d", + coord_name, coord_procid); + + for (i = 0; i < SEQ_HASH_TABLE_SIZE; i++) + { + GTM_SeqInfoHashBucket *bucket = >MSequences[i]; + gtm_ListCell *elem; + GTM_SeqInfo *curr_seqinfo; + + GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ); + + gtm_foreach(elem, bucket->shb_list) + { + int j; + curr_seqinfo = (GTM_SeqInfo *) gtm_lfirst(elem); + GTM_RWLockAcquire(&curr_seqinfo->gs_lock, GTM_LOCKMODE_WRITE); + if (curr_seqinfo->gs_state != SEQ_STATE_ACTIVE) + { + GTM_RWLockRelease(&curr_seqinfo->gs_lock); + continue; + } + + for (j = 0; j < curr_seqinfo->gs_lastval_count; j++) + { + GTM_SeqLastVal *lastval = &curr_seqinfo->gs_last_values[j]; + if (strcmp(lastval->gs_coord_name, coord_name) == 0 && + lastval->gs_coord_procid == coord_procid) + { + int newcount = --curr_seqinfo->gs_lastval_count; + elog(DEBUG1, "remove value of Sequence %s acquired for session %s:%d", + curr_seqinfo->gs_key->gsk_key, lastval->gs_coord_name, + lastval->gs_coord_procid); + if (j < newcount) + memcpy(lastval, &curr_seqinfo->gs_last_values[newcount], + sizeof(GTM_SeqLastVal)); + if (curr_seqinfo->gs_lastval_count == 0) + { + elog(DEBUG1, "Sequence %s is not used, free curr values memory", + curr_seqinfo->gs_key->gsk_key); + curr_seqinfo->gs_max_lastvals = 0; + pfree(curr_seqinfo->gs_last_values); + curr_seqinfo->gs_last_values = NULL; + } + break; + } + } + GTM_RWLockRelease(&curr_seqinfo->gs_lock); + } + GTM_RWLockRelease(&bucket->shb_lock); + } +} +#endif diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c index c8bf718cb9..3ecd0d6dcc 100644 --- a/src/gtm/main/gtm_snap.c +++ b/src/gtm/main/gtm_snap.c @@ -304,7 +304,7 @@ ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid) (EPROTO, errmsg("Message does not contain valid GXID"))); memcpy(&gxid, data, sizeof(gxid)); - elog(LOG, "Received transaction ID %d for snapshot 
obtention", gxid); + elog(INFO, "Received transaction ID %d for snapshot obtention", gxid); txn = GTM_GXIDToHandle(gxid); } else @@ -459,7 +459,7 @@ ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; retry: - elog(LOG, "calling snapshot_get_multi() for standby GTM %p.", + elog(DEBUG1, "calling snapshot_get_multi() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); _rc = snapshot_get_multi(GetMyThreadInfo->thr_conn->standby, @@ -469,7 +469,7 @@ retry: if (gtm_standby_check_communication_error(&count, oldconn)) goto retry; - elog(LOG, "snapshot_get_multi() rc=%d done.", _rc); + elog(DEBUG1, "snapshot_get_multi() rc=%d done.", _rc); } #endif diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index c4baa7cf8f..741d5eaad2 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -18,15 +18,12 @@ #include "gtm/elog.h" #include "gtm/gtm.h" #include "gtm/gtm_c.h" +#include "gtm/standby_utils.h" #include "gtm/gtm_client.h" #include "gtm/gtm_seq.h" #include "gtm/gtm_serialize.h" #include "gtm/gtm_utils.h" -#include "gtm/libpq.h" -#include "gtm/pqformat.h" #include "gtm/register.h" -#include "gtm/standby_utils.h" -#include "gtm/stringinfo.h" GTM_Conn *GTM_ActiveConn = NULL; static char standbyHostName[NI_MAXHOST]; @@ -59,12 +56,12 @@ gtm_standby_start_startup(void) int gtm_standby_finish_startup(void) { - elog(LOG, "Closing a startup connection..."); + elog(DEBUG1, "Closing a startup connection..."); GTMPQfinish(GTM_ActiveConn); GTM_ActiveConn = NULL; - elog(LOG, "A startup connection closed."); + elog(DEBUG1, "A startup connection closed."); return 1; } @@ -76,36 +73,36 @@ gtm_standby_restore_next_gxid(void) next_gxid = get_next_gxid(GTM_ActiveConn); GTM_RestoreTxnInfo(NULL, next_gxid); - elog(LOG, "Restoring the next GXID done."); + elog(DEBUG1, "Restoring the next GXID done."); return 1; } int gtm_standby_restore_sequence(void) { - GTM_SeqInfo 
*seq_list[1024]; + GTM_SeqInfo *seq_list; int num_seq; int i; /* * Restore sequence data. */ - num_seq = get_sequence_list(GTM_ActiveConn, seq_list, 1024); + num_seq = get_sequence_list(GTM_ActiveConn, &seq_list); for (i = 0; i < num_seq; i++) { - GTM_SeqRestore(seq_list[i]->gs_key, - seq_list[i]->gs_increment_by, - seq_list[i]->gs_min_value, - seq_list[i]->gs_max_value, - seq_list[i]->gs_init_value, - seq_list[i]->gs_value, - seq_list[i]->gs_state, - seq_list[i]->gs_cycle, - seq_list[i]->gs_called); + GTM_SeqRestore(seq_list[i].gs_key, + seq_list[i].gs_increment_by, + seq_list[i].gs_min_value, + seq_list[i].gs_max_value, + seq_list[i].gs_init_value, + seq_list[i].gs_value, + seq_list[i].gs_state, + seq_list[i].gs_cycle, + seq_list[i].gs_called); } - elog(LOG, "Restoring sequences done."); + elog(DEBUG1, "Restoring sequences done."); return 1; } @@ -194,7 +191,7 @@ gtm_standby_restore_gxid(void) GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); GTM_RWLockRelease(>MTransactions.gt_XidGenLock); - elog(LOG, "Restoring %d gxid(s) done.", num_txn); + elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn); return 1; } @@ -222,7 +219,7 @@ gtm_standby_restore_node(void) for (i = 0; i < num_node; i++) { - elog(LOG, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", + elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", data[i].type, data[i].nodename, data[i].datafolder); if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port, data[i].proxyname, data[i].status, @@ -254,7 +251,7 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir) { int rc; - elog(LOG, "Registering standby-GTM status..."); + elog(DEBUG1, "Registering standby-GTM status..."); node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc); if (rc != 0) @@ -269,11 +266,11 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir) standbyNodeName, standbyDataDir, NODE_DISCONNECTED); if (rc < 0) { 
- elog(LOG, "Failed to register a standby-GTM status."); + elog(DEBUG1, "Failed to register a standby-GTM status."); return 0; } - elog(LOG, "Registering standby-GTM done."); + elog(DEBUG1, "Registering standby-GTM done."); return 1; } @@ -288,12 +285,12 @@ gtm_standby_activate_self(void) { int rc; - elog(LOG, "Updating the standby-GTM status to \"CONNECTED\"..."); + elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"..."); rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName); if (rc < 0) { - elog(LOG, "Failed to unregister old standby-GTM status."); + elog(DEBUG1, "Failed to unregister old standby-GTM status."); return 0; } @@ -302,11 +299,11 @@ gtm_standby_activate_self(void) if (rc < 0) { - elog(LOG, "Failed to register a new standby-GTM status."); + elog(DEBUG1, "Failed to register a new standby-GTM status."); return 0; } - elog(LOG, "Updating the standby-GTM status done."); + elog(DEBUG1, "Updating the standby-GTM status done."); return 1; } @@ -329,7 +326,7 @@ find_standby_node_info(void) for (i = 0 ; i < n ; i++) { - elog(LOG, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", + elog(DEBUG1, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", node[i]->nodename, node[i]->type, node[i]->ipaddress, @@ -378,11 +375,11 @@ gtm_standby_connect_to_standby_int(int *report_needed) if (!n) { - elog(LOG, "Any GTM standby node not found in registered node(s)."); + elog(DEBUG1, "Any GTM standby node not found in registered node(s)."); return NULL; } - elog(LOG, "GTM standby is active. Going to connect."); + elog(DEBUG1, "GTM standby is active. Going to connect."); *report_needed = 1; snprintf(conn_string, sizeof(conn_string), @@ -393,11 +390,11 @@ gtm_standby_connect_to_standby_int(int *report_needed) if ( !standby ) { - elog(LOG, "Failed to establish a connection with GTM standby. - %p", n); + elog(DEBUG1, "Failed to establish a connection with GTM standby. 
- %p", n); return NULL; } - elog(LOG, "Connection established with GTM standby. - %p", n); + elog(DEBUG1, "Connection established with GTM standby. - %p", n); return standby; } @@ -427,13 +424,13 @@ gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max) for (i = 0; i < retry_max; i++) { - elog(LOG, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); newconn = gtm_standby_connect_to_standby_int(&report); if (newconn != NULL) break; - elog(LOG, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); } return newconn; @@ -465,7 +462,7 @@ gtm_standby_check_communication_error(int *retry_count, GTM_Conn *oldconn) return true; } - elog(LOG, "communication error with standby."); + elog(DEBUG1, "communication error with standby."); } return false; } @@ -495,7 +492,7 @@ gtm_standby_finishActiveConn(void) elog(DEBUG3, "Error in connection"); return; } - elog(LOG, "Connection established to the GTM active."); + elog(DEBUG1, "Connection established to the GTM active."); /* Unregister self from Active-GTM */ node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName); @@ -518,47 +515,3 @@ gtm_standby_connectToActiveGTM(void) return PQconnectGTM(connect_string); } - -void -ProcessGTMBeginBackup(Port *myport, StringInfo message) -{ - int ii; - GTM_ThreadInfo *my_threadinfo; - StringInfoData buf; - - pq_getmsgend(message); - my_threadinfo = GetMyThreadInfo; - - for (ii = 0; ii < GTMThreads->gt_array_size; ii++) - { - if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo) - GTM_RWLockAcquire(>MThreads->gt_threads[ii]->thr_lock, GTM_LOCKMODE_WRITE); - } - my_threadinfo->thr_status = GTM_THREAD_BACKUP; - pq_beginmessage(&buf, 'S'); - pq_sendint(&buf, BEGIN_BACKUP_RESULT, 4); - pq_endmessage(myport, &buf); - pq_flush(myport); -} - -void 
-ProcessGTMEndBackup(Port *myport, StringInfo message) -{ - int ii; - GTM_ThreadInfo *my_threadinfo; - StringInfoData buf; - - pq_getmsgend(message); - my_threadinfo = GetMyThreadInfo; - - for (ii = 0; ii < GTMThreads->gt_array_size; ii++) - { - if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo) - GTM_RWLockRelease(>MThreads->gt_threads[ii]->thr_lock); - } - my_threadinfo->thr_status = GTM_THREAD_RUNNING; - pq_beginmessage(&buf, 'S'); - pq_sendint(&buf, END_BACKUP_RESULT, 4); - pq_endmessage(myport, &buf); - pq_flush(myport); -} diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c index d0329a8169..4612023ab8 100644 --- a/src/gtm/main/gtm_thread.c +++ b/src/gtm/main/gtm_thread.c @@ -262,7 +262,7 @@ GTM_ThreadCleanup(void *argp) { GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; - elog(LOG, "Cleaning up thread state"); + elog(DEBUG1, "Cleaning up thread state"); if (thrinfo->thr_status == GTM_THREAD_BACKUP) { @@ -280,7 +280,7 @@ GTM_ThreadCleanup(void *argp) */ if (thrinfo->thr_conn->standby) { - elog(LOG, "Closing a connection to the GTM standby."); + elog(DEBUG1, "Closing a connection to the GTM standby."); GTMPQfinish(thrinfo->thr_conn->standby); thrinfo->thr_conn->standby = NULL; @@ -291,6 +291,14 @@ GTM_ThreadCleanup(void *argp) */ StreamClose(thrinfo->thr_conn->con_port->sock); + /* Free the node_name in the port */ + if (thrinfo->thr_conn->con_port->node_name != NULL) + /* + * We don't have to reset pointer to NULL her because ConnFree() + * frees this structure next. + */ + pfree(thrinfo->thr_conn->con_port->node_name); + /* Free the port */ ConnFree(thrinfo->thr_conn->con_port); thrinfo->thr_conn->con_port = NULL; diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 900569e510..475a861458 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -3,6 +3,11 @@ * gtm_txn.c * Transaction handling * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -39,6 +44,11 @@ static void init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo, GTM_IsolationLevel isolevel, GTMProxy_ConnID connid, bool readonly); +static void clean_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo); + +#ifdef XCP +GlobalTransactionId ControlXid; /* last one written to control file */ +#endif GTM_Transactions GTMTransactions; void @@ -100,6 +110,10 @@ GTM_InitTxnManager(void) GTMTransactions.gt_gtm_state = GTM_STARTING; +#ifdef XCP + ControlXid = FirstNormalGlobalTransactionId; +#endif + return; } @@ -155,7 +169,12 @@ GTM_GXIDToHandle(GlobalTransactionId gxid) if (gtm_txninfo != NULL) return gtm_txninfo->gti_handle; else + { + ereport(WARNING, + (ERANGE, errmsg("No transaction handle for gxid: %d", + gxid))); return InvalidTransactionHandle; + } } /* @@ -259,21 +278,7 @@ GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count) /* * Now mark the transaction as aborted and mark the structure as not-in-use */ - gtm_txninfo[ii]->gti_state = GTM_TXN_ABORTED; - gtm_txninfo[ii]->gti_in_use = false; - gtm_txninfo[ii]->gti_snapshot_set = false; - - /* Clean-up also structures that were used for prepared transactions */ - if (gtm_txninfo[ii]->gti_gid) - { - pfree(gtm_txninfo[ii]->gti_gid); - gtm_txninfo[ii]->gti_gid = NULL; - } - if (gtm_txninfo[ii]->nodestring) - { - pfree(gtm_txninfo[ii]->nodestring); - gtm_txninfo[ii]->nodestring = NULL; - } + clean_GTM_TransactionInfo(gtm_txninfo[ii]); } GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); @@ -328,20 +333,7 @@ GTM_RemoveAllTransInfos(int backend_id) /* * Now mark the 
transaction as aborted and mark the structure as not-in-use */ - gtm_txninfo->gti_state = GTM_TXN_ABORTED; - gtm_txninfo->gti_in_use = false; - gtm_txninfo->gti_snapshot_set = false; - - if (gtm_txninfo->gti_gid) - { - pfree(gtm_txninfo->gti_gid); - gtm_txninfo->gti_gid = NULL; - } - if (gtm_txninfo->nodestring) - { - pfree(gtm_txninfo->nodestring); - gtm_txninfo->nodestring = NULL; - } + clean_GTM_TransactionInfo(gtm_txninfo); /* move to next cell in the list */ if (prev) @@ -504,6 +496,9 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count) GlobalTransactionId xid, start_xid = InvalidGlobalTransactionId; GTM_TransactionInfo *gtm_txninfo = NULL; int ii; +#ifdef XCP + bool save_control = false; +#endif if (Recovery_IsStandby()) { @@ -578,12 +573,29 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count) gtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]); Assert(gtm_txninfo); - elog(LOG, "Assigning new transaction ID = %d", xid); + elog(INFO, "Assigning new transaction ID = %d", xid); gtm_txninfo->gti_gxid = xid; } +#ifdef XCP + /* Periodically write the xid and sequence info out to the control file. + * Try and handle wrapping, too. + */ + if (xid - ControlXid > CONTROL_INTERVAL || xid < ControlXid) + { + save_control = true; + ControlXid = xid; + } +#endif + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); +#ifdef XCP + /* save control info when not holding the XidGenLock */ + if (save_control) + SaveControlInfo(); +#endif + return start_xid; } @@ -659,6 +671,8 @@ GTM_BeginTransactionMulti(char *coord_name, */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + for (kk = 0; kk < txn_count; kk++) { int ii, jj, startslot; @@ -667,8 +681,6 @@ GTM_BeginTransactionMulti(char *coord_name, * We had no cached slots. 
Now find a free slot in the transation array * and store the transaction info structure there */ - GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); - startslot = GTMTransactions.gt_lastslot + 1; if (startslot >= GTM_MAX_GLOBAL_TRANSACTIONS) startslot = 0; @@ -737,7 +749,7 @@ init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo, gtm_txninfo->gti_gxid = InvalidGlobalTransactionId; gtm_txninfo->gti_xmin = InvalidGlobalTransactionId; gtm_txninfo->gti_state = GTM_TXN_STARTING; - gtm_txninfo->gti_coordname = pstrdup(coord_name); + gtm_txninfo->gti_coordname = (coord_name ? pstrdup(coord_name) : NULL); gtm_txninfo->gti_isolevel = isolevel; gtm_txninfo->gti_readonly = readonly; @@ -753,6 +765,35 @@ init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo, } +/* + * Clean up the TransactionInfo slot and pfree all the palloc'ed memory, + * except txid array of the snapshot, which is reused. + */ +static void +clean_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo) +{ + gtm_txninfo->gti_state = GTM_TXN_ABORTED; + gtm_txninfo->gti_in_use = false; + gtm_txninfo->gti_snapshot_set = false; + + if (gtm_txninfo->gti_coordname) + { + pfree(gtm_txninfo->gti_coordname); + gtm_txninfo->gti_coordname = NULL; + } + if (gtm_txninfo->gti_gid) + { + pfree(gtm_txninfo->gti_gid); + gtm_txninfo->gti_gid = NULL; + } + if (gtm_txninfo->nodestring) + { + pfree(gtm_txninfo->nodestring); + gtm_txninfo->nodestring = NULL; + } +} + + void GTM_BkupBeginTransactionMulti(char *coord_name, GTM_TransactionHandle *txn, @@ -1169,7 +1210,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message) MemoryContextSwitchTo(oldContext); - elog(LOG, "Sending transaction id %u", gxid); + elog(DEBUG1, "Sending transaction id %u", gxid); /* Backup first */ if (GetMyThreadInfo->thr_conn->standby) @@ -1177,7 +1218,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - 
elog(LOG, "calling begin_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling begin_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: bkup_begin_transaction_gxid(GetMyThreadInfo->thr_conn->standby, @@ -1229,7 +1270,13 @@ GTM_BkupBeginTransactionGetGXIDMulti(char *coord_name, int ii; MemoryContext oldContext; - oldContext = MemoryContextSwitchTo(TopMemoryContext); +#ifdef XCP + bool save_control = false; + GlobalTransactionId xid; +#endif + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + //XCPTODO check oldContext = MemoryContextSwitchTo(TopMemoryContext); GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); for (ii = 0; ii < txn_count; ii++) @@ -1254,10 +1301,29 @@ GTM_BkupBeginTransactionGetGXIDMulti(char *coord_name, if (!GlobalTransactionIdIsValid(GTMTransactions.gt_nextXid)) /* Handle wrap around too */ GTMTransactions.gt_nextXid = FirstNormalGlobalTransactionId; GTMTransactions.gt_open_transactions = gtm_lappend(GTMTransactions.gt_open_transactions, gtm_txninfo); + xid = GTMTransactions.gt_nextXid; } +#ifdef XCP + /* Periodically write the xid and sequence info out to the control file. + * Try and handle wrapping, too. 
+ */ + if (xid - ControlXid > CONTROL_INTERVAL || xid < ControlXid) + { + save_control = true; + ControlXid = xid; + } +#endif + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + +#ifdef XCP + /* save control info when not holding the XidGenLock */ + if (save_control) + SaveControlInfo(); +#endif + MemoryContextSwitchTo(oldContext); } @@ -1365,7 +1431,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling begin_transaction_autovacuum() for standby GTM %p.", + elog(DEBUG1, "calling begin_transaction_autovacuum() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -1379,7 +1445,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "begin_transaction_autovacuum() GXID=%d done.", _gxid); + elog(DEBUG1, "begin_transaction_autovacuum() GXID=%d done.", _gxid); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -1439,9 +1505,9 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) /* * Start a new transaction * - * XXX Port should contain Coordinator name - replace "" with that + * XXX Port should contain Coordinator name - replace NULL with that */ - count = GTM_BeginTransactionMulti("", txn_isolation_level, txn_read_only, txn_connid, + count = GTM_BeginTransactionMulti(NULL, txn_isolation_level, txn_read_only, txn_connid, txn_count, txn); if (count != txn_count) ereport(ERROR, @@ -1459,11 +1525,11 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) /* GXID has been received, now it's time to get a GTM timestamp */ timestamp = GTM_TimestampGetCurrent(); - end_gxid = start_gxid + txn_count; + end_gxid = start_gxid + (txn_count - 1); if (end_gxid < start_gxid) end_gxid += 
FirstNormalGlobalTransactionId; - elog(LOG, "Sending transaction ids from %u to %u", start_gxid, end_gxid); + elog(DEBUG1, "Sending transaction ids from %u to %u", start_gxid, end_gxid); /* Backup first */ if (GetMyThreadInfo->thr_conn->standby) @@ -1472,7 +1538,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling begin_transaction_multi() for standby GTM %p.", + elog(DEBUG1, "calling begin_transaction_multi() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -1491,7 +1557,7 @@ retry: if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "begin_transaction_multi() rc=%d done.", _rc); + elog(DEBUG1, "begin_transaction_multi() rc=%d done.", _rc); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -1589,7 +1655,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup oldContext = MemoryContextSwitchTo(TopMemoryContext); - elog(LOG, "Committing transaction id %u", gxid); + elog(DEBUG1, "Committing transaction id %u", gxid); /* * Commit the transaction @@ -1609,7 +1675,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling commit_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling commit_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: _rc = bkup_commit_transaction(GetMyThreadInfo->thr_conn->standby, gxid); @@ -1621,7 +1687,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "commit_transaction() rc=%d done.", _rc); + elog(DEBUG1, 
"commit_transaction() rc=%d done.", _rc); } pq_beginmessage(&buf, 'S'); @@ -1697,7 +1763,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i oldContext = MemoryContextSwitchTo(TopMemoryContext); - elog(LOG, "Committing: prepared id %u and commit prepared id %u ", gxid[0], gxid[1]); + elog(DEBUG1, "Committing: prepared id %u and commit prepared id %u ", gxid[0], gxid[1]); /* * Commit the prepared transaction. @@ -1715,7 +1781,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling commit_prepared_transaction() for standby GTM %p.", + elog(DEBUG1, "calling commit_prepared_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -1729,7 +1795,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "commit_prepared_transaction() rc=%d done.", _rc); + elog(DEBUG1, "commit_prepared_transaction() rc=%d done.", _rc); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -1858,7 +1924,7 @@ ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling get_gid_data() for standby GTM %p.", + elog(DEBUG1, "calling get_gid_data() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -1872,7 +1938,7 @@ retry: if (gtm_standby_check_communication_error(&count, oldconn)) goto retry; - elog(LOG, "get_gid_data() rc=%d done.", _rc); + elog(DEBUG1, "get_gid_data() rc=%d done.", _rc); } #endif @@ -1909,7 +1975,7 @@ ProcessGXIDListCommand(Port *myport, StringInfo message) actlen = gtm_serialize_transactions(>MTransactions, data, estlen); - elog(LOG, "gtm_serialize_transactions: estlen=%ld, actlen=%ld", estlen, actlen); + 
elog(DEBUG1, "gtm_serialize_transactions: estlen=%ld, actlen=%ld", estlen, actlen); GTM_RWLockRelease(>MTransactions.gt_XidGenLock); @@ -1935,10 +2001,10 @@ ProcessGXIDListCommand(Port *myport, StringInfo message) if (myport->remote_type != GTM_NODE_GTM_PROXY) { pq_flush(myport); - elog(LOG, "pq_flush()"); + elog(DEBUG1, "pq_flush()"); } - elog(LOG, "ProcessGXIDListCommand() ok. %ld bytes sent. len=%d", actlen, buf.len); + elog(DEBUG1, "ProcessGXIDListCommand() ok. %ld bytes sent. len=%d", actlen, buf.len); free(data); return; @@ -1986,7 +2052,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back oldContext = MemoryContextSwitchTo(TopMemoryContext); - elog(LOG, "Cancelling transaction id %u", gxid); + elog(DEBUG1, "Cancelling transaction id %u", gxid); /* * Commit the transaction @@ -2003,7 +2069,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling abort_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling abort_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: bkup_abort_transaction(GetMyThreadInfo->thr_conn->standby, gxid); @@ -2015,7 +2081,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "abort_transaction() GXID=%d done.", gxid); + elog(DEBUG1, "abort_transaction() GXID=%d done.", gxid); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -2108,7 +2174,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message, bool is_b GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling commit_transaction_multi() for standby GTM %p.", + elog(DEBUG1, "calling commit_transaction_multi() for standby GTM %p.", 
GetMyThreadInfo->thr_conn->standby); retry: @@ -2120,7 +2186,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message, bool is_b if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "commit_transaction_multi() rc=%d done.", _rc); + elog(DEBUG1, "commit_transaction_multi() rc=%d done.", _rc); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -2211,7 +2277,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message, bool is GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling abort_transaction_multi() for standby GTM %p.", + elog(DEBUG1, "calling abort_transaction_multi() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -2224,7 +2290,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message, bool is if (Backup_synchronously &&(myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "abort_transaction_multi() rc=%d done.", _rc); + elog(DEBUG1, "abort_transaction_multi() rc=%d done.", _rc); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -2325,7 +2391,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message, bool is GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling start_prepared_transaction() for standby GTM %p.", + elog(DEBUG1, "calling start_prepared_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -2340,7 +2406,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message, bool is if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "start_prepared_transaction() rc=%d done.", _rc); + elog(DEBUG1, "start_prepared_transaction() rc=%d done.", _rc); } pq_beginmessage(&buf, 'S'); pq_sendint(&buf, 
TXN_START_PREPARED_RESULT, 4); @@ -2412,7 +2478,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu MemoryContextSwitchTo(oldContext); - elog(LOG, "Preparing transaction id %u", gxid); + elog(DEBUG1, "Preparing transaction id %u", gxid); if (!is_backup) { @@ -2422,7 +2488,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "calling prepare_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); + elog(DEBUG1, "calling prepare_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: bkup_prepare_transaction(GetMyThreadInfo->thr_conn->standby, gxid); @@ -2434,7 +2500,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); - elog(LOG, "prepare_transaction() GXID=%d done.", gxid); + elog(DEBUG1, "prepare_transaction() GXID=%d done.", gxid); } /* Respond to the client */ pq_beginmessage(&buf, 'S'); @@ -2591,7 +2657,15 @@ GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid) (!GlobalTransactionIdIsValid(next_gxid))) next_gxid = InitialGXIDValue_Default; else if (!GlobalTransactionIdIsValid(next_gxid)) +#ifdef XCP + { + /* Add in extra amount in case we had not gracefully stopped */ + next_gxid = saved_gxid + CONTROL_INTERVAL; + ControlXid = next_gxid; + } +#else next_gxid = saved_gxid; +#endif } else if (!GlobalTransactionIdIsValid(next_gxid)) next_gxid = InitialGXIDValue_Default; diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index e03738725c..2205beb055 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -2,6 +2,11 @@ * * main.c * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -72,6 +77,11 @@ int tcp_keepalives_count; char *error_reporter; char *status_reader; bool isStartUp; +#ifdef XCP +GTM_MutexLock control_lock; +char GTMControlFileTmp[GTM_MAX_PATH]; +#define GTM_CONTROL_FILE_TMP "gtm.control.tmp" +#endif /* If this is GTM or not */ /* @@ -182,6 +192,9 @@ BaseInit() CreateDataDirLockFile(); sprintf(GTMControlFile, "%s/%s", GTMDataDir, GTM_CONTROL_FILE); +#ifdef XCP + sprintf(GTMControlFileTmp, "%s/%s", GTMDataDir, GTM_CONTROL_FILE_TMP); +#endif if (GTMLogFile == NULL) { GTMLogFile = (char *) malloc(GTM_MAX_PATH); @@ -206,6 +219,9 @@ BaseInit() fflush(stdout); fflush(stderr); } +#ifdef XCP + GTM_MutexLockInit(&control_lock); +#endif } static void @@ -264,7 +280,6 @@ help(const char *progname) printf(_(" -D directory GTM working directory\n")); printf(_(" -l filename GTM server log file name \n")); printf(_(" -c show server status, then exit\n")); - printf(_(" -V, --version output version information, then exit\n")); printf(_(" --help show this help, then exit\n")); printf(_("\n")); printf(_("Options for Standby mode:\n")); @@ -281,6 +296,38 @@ gtm_status() exit(0); } +#ifdef XCP +/* + * Save control file info + */ +void +SaveControlInfo(void) +{ + FILE *ctlf; + + GTM_MutexLockAcquire(&control_lock); + + ctlf = fopen(GTMControlFileTmp, "w"); + + if (ctlf == NULL) + { + fprintf(stderr, "Failed to create/open the control file\n"); + fclose(ctlf); + GTM_MutexLockRelease(&control_lock); + return; + } + + GTM_SaveTxnInfo(ctlf); + GTM_SaveSeqInfo(ctlf); + fclose(ctlf); + + remove(GTMControlFile); + rename(GTMControlFileTmp, GTMControlFile); + 
+ GTM_MutexLockRelease(&control_lock); +} +#endif + int main(int argc, char *argv[]) { @@ -339,11 +386,6 @@ main(int argc, char *argv[]) help(argv[0]); exit(0); } - if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) - { - puts("gtm (Postgres-XC) " PGXC_VERSION); - exit(0); - } } ListenAddresses = strdup(GTM_DEFAULT_HOSTNAME); @@ -591,11 +633,19 @@ main(int argc, char *argv[]) } else { +#ifdef XCP + GTM_MutexLockAcquire(&control_lock); +#endif + ctlf = fopen(GTMControlFile, "r"); GTM_RestoreTxnInfo(ctlf, next_gxid); GTM_RestoreSeqInfo(ctlf); if (ctlf) fclose(ctlf); + +#ifdef XCP + GTM_MutexLockRelease(&control_lock); +#endif } if (Recovery_IsStandby()) @@ -692,13 +742,13 @@ main(int argc, char *argv[]) elog(ERROR, "Failed to update the standby-GTM status as \"CONNECTED\"."); exit(1); } - elog(LOG, "Updating the standby-GTM status as \"CONNECTED\" succeeded."); + elog(DEBUG1, "Updating the standby-GTM status as \"CONNECTED\" succeeded."); if (!gtm_standby_finish_startup()) { elog(ERROR, "Failed to close the initial connection to the active-GTM."); exit(1); } - elog(LOG, "Startup connection with the active-GTM closed."); + elog(DEBUG1, "Startup connection with the active-GTM closed."); } /* @@ -782,7 +832,9 @@ ServerLoop(void) if (GTMAbortPending) { +#ifndef XCP FILE *ctlf; +#endif /* * XXX We should do a clean shutdown here. 
For the time being, just @@ -797,6 +849,9 @@ ServerLoop(void) */ GTM_SetShuttingDown(); +#ifdef XCP + SaveControlInfo(); +#else ctlf = fopen(GTMControlFile, "w"); if (ctlf == NULL) { @@ -806,6 +861,7 @@ ServerLoop(void) GTM_SaveTxnInfo(ctlf); GTM_SaveSeqInfo(ctlf); +#endif #if 0 /* @@ -821,7 +877,9 @@ ServerLoop(void) } #endif +#ifndef XCP fclose(ctlf); +#endif exit(1); } @@ -1002,6 +1060,7 @@ GTM_ThreadMain(void *argp) pq_getmsgend(&inBuf); GTM_RegisterPGXCNode(thrinfo->thr_conn->con_port, sp.sp_node_name); + thrinfo->thr_conn->con_port->remote_type = sp.sp_remotetype; thrinfo->thr_conn->con_port->is_postmaster = sp.sp_ispostmaster; } @@ -1213,6 +1272,9 @@ ProcessCommand(Port *myport, StringInfo input_message) case MSG_NODE_UNREGISTER: case MSG_BKUP_NODE_UNREGISTER: case MSG_NODE_LIST: +#ifdef XCP + case MSG_REGISTER_SESSION: +#endif ProcessPGXCNodeCommand(myport, mtype, input_message); break; case MSG_BEGIN_BACKUP: @@ -1260,6 +1322,7 @@ ProcessCommand(Port *myport, StringInfo input_message) case MSG_SEQUENCE_INIT: case MSG_BKUP_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: case MSG_SEQUENCE_GET_NEXT: case MSG_BKUP_SEQUENCE_GET_NEXT: case MSG_SEQUENCE_GET_LAST: @@ -1449,6 +1512,12 @@ ProcessPGXCNodeCommand(Port *myport, GTM_MessageType mtype, StringInfo message) ProcessPGXCNodeList(myport, message); break; +#ifdef XCP + case MSG_REGISTER_SESSION: + ProcessPGXCRegisterSession(myport, message); + break; +#endif + default: Assert(0); /* Shouldn't come here.. 
keep compiler quite */ } @@ -1483,6 +1552,7 @@ ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo messag case MSG_BKUP_TXN_BEGIN_GETGXID: ProcessBkupBeginTransactionGetGXIDCommand(myport, message); + break; case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message); @@ -1627,6 +1697,10 @@ ProcessSequenceCommand(Port *myport, GTM_MessageType mtype, StringInfo message) ProcessSequenceAlterCommand(myport, message, true); break; + case MSG_SEQUENCE_GET_CURRENT: + ProcessSequenceGetCurrentCommand(myport, message); + break; + case MSG_SEQUENCE_GET_NEXT: ProcessSequenceGetNextCommand(myport, message, false); break; @@ -1692,13 +1766,15 @@ ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message) } + static void GTM_RegisterPGXCNode(Port *myport, char *PGXCNodeName) { elog(DEBUG3, "Registering coordinator with name %s", PGXCNodeName); - myport->node_name = strdup(PGXCNodeName); + myport->node_name = pstrdup(PGXCNodeName); } + /* * Validate the proposed data directory */ diff --git a/src/gtm/path/Makefile b/src/gtm/path/Makefile index 4e6a159b19..186b3b1876 100644 --- a/src/gtm/path/Makefile +++ b/src/gtm/path/Makefile @@ -11,18 +11,21 @@ top_builddir=../../.. 
include $(top_builddir)/src/Makefile.global subdir=src/gtm/path -include $(top_srcdir)/src/backend/common.mk +NAME=gtmpath +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 -OBJS = path.o +OBJS=path.o -all: libgtmpath.a +all:all-lib -libgtmpath.a: $(OBJS) - $(AR) $(AROPT) $@ $^ +include $(top_srcdir)/src/Makefile.shlib clean: - rm -f $(OBJS) libgtmpath.a + rm -f $(OBJS) + rm -f libgtmpath.a libgtmpath.so libgtmpath.so.1 libgtmpath.so.1.0 distclean: clean maintainer-clean: distclean + diff --git a/src/gtm/proxy/Makefile b/src/gtm/proxy/Makefile index f6b0b1e335..c1ab2018d8 100644 --- a/src/gtm/proxy/Makefile +++ b/src/gtm/proxy/Makefile @@ -15,38 +15,23 @@ ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif -SUBDIRS = $(top_builddir)/src/gtm/client \ - $(top_builddir)/src/gtm/common \ - $(top_builddir)/src/gtm/config \ - $(top_builddir)/src/gtm/libpq \ - $(top_builddir)/src/gtm/path \ - $(top_builddir)/src/gtm/recovery +OBJS=proxy_main.o proxy_thread.o proxy_utils.o gtm_proxy_opt.o -include $(top_srcdir)/src/backend/common.mk +OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a -OBJS = $(SUBDIROBJS) \ - $(top_builddir)/src/port/libpgport_srv.a \ - $(top_builddir)/src/port/pgsleep.o \ - proxy_main.o proxy_thread.o proxy_utils.o gtm_proxy_opt.o +LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq -LIBS += $(PTHREAD_LIBS) +LIBS=-lpthread -all: gtm_proxy +gtm_proxy:$(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) ../../port/libpgport_srv.a -o gtm_proxy -gtm_proxy: $(OBJS) | submake-libpgport - $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $(call expand_subsys,$^) -o $@ +all:gtm_proxy -install: all installdirs - $(INSTALL_PROGRAM) gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)' - $(INSTALL_DATA) $(srcdir)/gtm_proxy.conf.sample '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample' +clean: + rm -f $(OBJS) + rm -f gtm_proxy -installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(datadir)' 
+distclean: clean -uninstall: - rm -f '$(DESTDIR)$(bindir)/gtm_proxy$(X)' '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample' - -clean distclean maintainer-clean: - rm -f gtm_proxy$(X) $(OBJS) - -$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport +maintainer-clean: distclean diff --git a/src/gtm/proxy/gtm_proxy_opt.c b/src/gtm/proxy/gtm_proxy_opt.c index 58e2beb7c2..96be9b56fc 100644 --- a/src/gtm/proxy/gtm_proxy_opt.c +++ b/src/gtm/proxy/gtm_proxy_opt.c @@ -158,7 +158,7 @@ struct config_int ConfigureNamesInt[] = 0 }, >MProxyPortNumber, - 6666, 0, INT_MAX, + 0, 0, INT_MAX, 0, NULL }, { @@ -169,7 +169,7 @@ struct config_int ConfigureNamesInt[] = 0 }, >MServerPortNumber, - 6666, 0, INT_MAX, + 0, 0, INT_MAX, 0, NULL }, { diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 0285f26d40..b3f4649cb4 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -2,6 +2,11 @@ * * proxy_main.c * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -177,7 +182,7 @@ static void SetDataDir(void); static void ChangeToDataDir(void); static void checkDataDir(void); static void DeleteLockFile(const char *filename); -static void RegisterProxy(bool is_reconnect, bool is_retry); +static void RegisterProxy(bool is_reconnect); static void UnregisterProxy(void); static GTM_Conn *ConnectGTM(void); static void ReleaseCmdBackup(GTMProxy_CommandInfo *cmdinfo); @@ -260,7 +265,7 @@ BaseInit() Recovery_SaveRegisterFileName(GTMProxyDataDir); /* Register Proxy on GTM */ - RegisterProxy(false, false); + RegisterProxy(false); DebugFileOpen(); @@ -395,7 +400,7 @@ GTMProxy_SigleHandler(int signal) { int ii; - elog(LOG, "Received signal %d\n", signal); + elog(DEBUG1, "Received signal %d\n", signal); switch (signal) { @@ -414,11 +419,11 @@ GTMProxy_SigleHandler(int signal) * The mask is set to block signals. They're blocked until all the * threads reconnect to the new GTM. */ - elog(LOG, "Accepted SIGUSR1\n"); + elog(DEBUG1, "Accepted SIGUSR1\n"); if (MyThreadID != TopMostThreadID) { - elog(LOG, "Not on main thread, proxy the signal to the main thread."); + elog(DEBUG1, "Not on main thread, proxy the signal to the main thread."); pthread_kill(TopMostThreadID, SIGUSR1); return; @@ -428,18 +433,18 @@ GTMProxy_SigleHandler(int signal) */ PG_SETMASK(&BlockSig); - elog(LOG, "I'm the main thread. Accepted SIGUSR1."); + elog(DEBUG1, "I'm the main thread. Accepted SIGUSR1."); /* * Set Reconnect Info */ if (!ReadyToReconnect) { - elog(LOG, "SIGUSR1 detected, but not ready to handle this. Ignored"); + elog(DEBUG1, "SIGUSR1 detected, but not ready to handle this. Ignored"); PG_SETMASK(&UnBlockSig); return; } - elog(LOG, "SIGUSR1 detected. Set reconnect info for each worker thread"); + elog(DEBUG1, "SIGUSR1 detected. 
Set reconnect info for each worker thread"); if (GTMProxy_ReadReconnectInfo() != 0) { /* Failed to read reconnect information from reconnect data file */ @@ -477,7 +482,7 @@ GTMProxy_SigleHandler(int signal) for (ii = 0; ii < GTMProxyWorkerThreads; ii++) pthread_kill(Proxy_ThreadInfo[ii]->thr_id, SIGUSR2); - elog(LOG, "SIGUSR2 issued to all the worker threads."); + elog(DEBUG1, "SIGUSR2 issued to all the worker threads."); PG_SETMASK(&UnBlockSig); /* @@ -490,13 +495,13 @@ GTMProxy_SigleHandler(int signal) /* Main thread has nothing to do twith this signal and should not receive this. */ PG_SETMASK(&BlockSig); - elog(LOG, "Detected SIGUSR2, thread:%ld", MyThreadID); + elog(DEBUG1, "Detected SIGUSR2, thread:%ld", MyThreadID); if (MyThreadID == TopMostThreadID) { /* This should not be reached. Just in case. */ - elog(LOG, "SIGUSR2 received by the main thread. Ignoring."); + elog(DEBUG1, "SIGUSR2 received by the main thread. Ignoring."); PG_SETMASK(&UnBlockSig); return; @@ -549,7 +554,6 @@ help(const char *progname) printf(_(" -n count Number of worker threads\n")); printf(_(" -D directory GTM proxy working directory\n")); printf(_(" -l filename GTM proxy log file name \n")); - printf(_(" -V, --version output version information, then exit\n")); printf(_(" --help show this help, then exit\n")); } @@ -590,11 +594,6 @@ main(int argc, char *argv[]) help(argv[0]); exit(0); } - if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) - { - puts("gtm_proxy (Postgres-XC) " PGXC_VERSION); - exit(0); - } } ListenAddresses = strdup(GTM_PROXY_DEFAULT_HOSTNAME); @@ -961,9 +960,9 @@ ServerLoop(void) * the resource but this may not happen so many times. */ - elog(LOG, "Main Thread reconnecting to new GTM."); - RegisterProxy(TRUE, false); - elog(LOG, "Reconnected."); + elog(DEBUG1, "Main Thread reconnecting to new GTM."); + RegisterProxy(TRUE); + elog(DEBUG1, "Reconnected."); /* If it is done, then release the lock for worker threads. 
*/ GTM_RWLockRelease(&ReconnectControlLock); @@ -1015,7 +1014,7 @@ ServerLoop(void) { if (errno != EINTR && errno != EWOULDBLOCK) { - ereport(LOG, + ereport(DEBUG1, (EACCES, errmsg("select() failed in postmaster: %m"))); return STATUS_ERROR; @@ -1094,6 +1093,7 @@ GTMProxy_ThreadMain(void *argp) int ii, nrfds; char gtm_connect_string[1024]; int first_turn = TRUE; /* Used only to set longjmp target at the first turn of thread loop */ + GTMProxy_CommandData cmd_data = {}; elog(DEBUG3, "Starting the connection helper thread"); @@ -1348,7 +1348,6 @@ setjmp_again: /* * Correction of pending works. */ - thrinfo->thr_processed_commands = gtm_NIL; for (ii = 0; ii < MSG_TYPE_COUNT; ii++) { thrinfo->thr_pending_commands[ii] = gtm_NIL; @@ -1387,7 +1386,8 @@ setjmp_again: * to the remove_list and cleanup at the end of this round of * cleanup. */ - GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn); + GTMProxy_CommandPending(thrinfo->thr_conn, + MSG_BACKEND_DISCONNECT, cmd_data); continue; } @@ -1421,7 +1421,8 @@ setjmp_again: * to the server to quickly find the backend connection * while processing proxied messages. 
*/ - GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn); + GTMProxy_CommandPending(thrinfo->thr_conn, + MSG_BACKEND_DISCONNECT, cmd_data); break; default: /* @@ -1565,6 +1566,9 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, { case MSG_NODE_REGISTER: case MSG_NODE_UNREGISTER: +#ifdef XCP + case MSG_REGISTER_SESSION: +#endif ProcessPGXCNodeCommand(conninfo, gtm_conn, mtype, input_message); break; @@ -1587,6 +1591,7 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, break; case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: case MSG_SEQUENCE_GET_NEXT: case MSG_SEQUENCE_GET_LAST: case MSG_SEQUENCE_SET_VAL: @@ -1882,8 +1887,12 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, case MSG_TXN_GET_GID_DATA: case MSG_NODE_REGISTER: case MSG_NODE_UNREGISTER: +#ifdef XCP + case MSG_REGISTER_SESSION: +#endif case MSG_SNAPSHOT_GXID_GET: case MSG_SEQUENCE_INIT: + case MSG_SEQUENCE_GET_CURRENT: case MSG_SEQUENCE_GET_NEXT: case MSG_SEQUENCE_GET_LAST: case MSG_SEQUENCE_SET_VAL: @@ -2160,6 +2169,16 @@ ProcessPGXCNodeCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, /* Unregistering has to be saved in a place where it can be seen by all the threads */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); +#ifdef XCP + /* + * Unregister node. 
Ignore any error here, otherwise we enter + * endless loop trying to execute command again and again + */ + Recovery_PGXCNodeUnregister(cmd_data.cd_reg.type, + cmd_data.cd_reg.nodename, + false, + conninfo->con_port->sock); +#else /* Unregister Node also on Proxy */ if (Recovery_PGXCNodeUnregister(cmd_data.cd_reg.type, cmd_data.cd_reg.nodename, @@ -2170,12 +2189,17 @@ ProcessPGXCNodeCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, (EINVAL, errmsg("Failed to Unregister node"))); } - +#endif MemoryContextSwitchTo(oldContext); GTMProxy_ProxyPGXCNodeCommand(conninfo, gtm_conn, mtype, cmd_data); break; } +#ifdef XCP + case MSG_REGISTER_SESSION: + GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); + break; +#endif default: Assert(0); /* Shouldn't come here.. Keep compiler quiet */ } @@ -2439,6 +2463,10 @@ GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype GTMProxy_CommandInfo *cmdinfo; GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo; +#ifdef XCP + MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext); +#endif + /* * Add the message to the pending command list */ @@ -2449,6 +2477,10 @@ GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype cmdinfo->ci_data = cmd_data; thrinfo->thr_pending_commands[mtype] = gtm_lappend(thrinfo->thr_pending_commands[mtype], cmdinfo); +#ifdef XCP + MemoryContextSwitchTo(oldContext); +#endif + return; } @@ -2529,7 +2561,8 @@ GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn) /* Mark node as disconnected if it is a postmaster backend */ Recovery_PGXCNodeDisconnect(conninfo->con_port); - /* Start the message. */ + proxyhdr.ph_conid = conninfo->con_id; + /* Start the message. 
*/ if (gtmpqPutMsgStart('C', true, gtm_conn) || gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) || gtmpqPutInt(MSG_BACKEND_DISCONNECT, sizeof (GTM_MessageType), gtm_conn) || @@ -2559,8 +2592,6 @@ GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn) ConnFree(conninfo->con_port); conninfo->con_port = NULL; - proxyhdr.ph_conid = conninfo->con_id; - return; } @@ -2580,7 +2611,9 @@ GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo) { int res_index = 0; - if (gtm_list_length(thrinfo->thr_pending_commands[ii]) == 0) + /* We process backend disconnects last! */ + if (ii == MSG_BACKEND_DISCONNECT || + gtm_list_length(thrinfo->thr_pending_commands[ii]) == 0) continue; /* @@ -2755,7 +2788,15 @@ GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo) default: elog(ERROR, "This message type (%d) can not be grouped together", ii); } - + } + /* Process backend disconnect messages now */ + gtm_foreach (elem, thrinfo->thr_pending_commands[MSG_BACKEND_DISCONNECT]) + { + ereport(COMMERROR, + (EPROTO, + errmsg("cleaning up client disconnection"))); + cmdinfo = (GTMProxy_CommandInfo *)gtm_lfirst(elem); + GTMProxy_HandleDisconnect(cmdinfo->ci_conn, gtm_conn); } } @@ -3151,7 +3192,7 @@ failed: * NewGTMServerPortNumber. */ static void -RegisterProxy(bool is_reconnect, bool is_retry) +RegisterProxy(bool is_reconnect) { GTM_PGXCNodeType type = GTM_NODE_GTM_PROXY; GTM_PGXCNodePort port = (GTM_PGXCNodePort) GTMProxyPortNumber; @@ -3236,14 +3277,7 @@ RegisterProxy(bool is_reconnect, bool is_retry) return; failed: - if (!is_retry) - { - elog(NOTICE, "could not register Proxy on GTM. 
Trying to unregister myself and then retry."); - UnregisterProxy(); - return RegisterProxy(is_reconnect, true); - } - else - elog(ERROR, "can not register Proxy on GTM"); + elog(ERROR, "can not register Proxy on GTM"); } static GTM_Conn* @@ -3299,26 +3333,18 @@ workerThreadReconnectToGTM(void) PG_SETMASK(&UnBlockSig); /* Disconnect the current connection and re-connect to the new GTM */ - /* - * Because some error is expected, it is harmful to close GTM connection in - * normal way. Instead, just close the socket to save kernel resource. - * - * This is error recovery and we should be very careful what structure is - * available. - */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); - if (GetMyThreadInfo && GetMyThreadInfo->thr_gtm_conn && GetMyThreadInfo->thr_gtm_conn->sock != -1) - StreamClose(GetMyThreadInfo->thr_gtm_conn->sock); - + if (GetMyThreadInfo->thr_gtm_conn) + GTMPQfinish(GetMyThreadInfo->thr_gtm_conn); sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d", GTMServerHost, GTMServerPortNumber, GTMProxyNodeName, GTM_NODE_GTM_PROXY); - elog(LOG, "Worker thread connecting to %s", gtm_connect_string); + elog(DEBUG1, "Worker thread connecting to %s", gtm_connect_string); GetMyThreadInfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string); if (GetMyThreadInfo->thr_gtm_conn == NULL) elog(FATAL, "Worker thread GTM connection failed."); - elog(LOG, "Worker thread connection done."); + elog(DEBUG1, "Worker thread connection done."); MemoryContextSwitchTo(oldContext); diff --git a/src/gtm/proxy/proxy_thread.c b/src/gtm/proxy/proxy_thread.c index ad8d155c4f..4247be2d69 100644 --- a/src/gtm/proxy/proxy_thread.c +++ b/src/gtm/proxy/proxy_thread.c @@ -254,7 +254,7 @@ GTMProxy_ThreadCleanup(void *argp) { GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp; - elog(LOG, "Cleaning up thread state"); + elog(DEBUG1, "Cleaning up thread state"); /* * TODO Close the open connection. 
diff --git a/src/gtm/recovery/Makefile b/src/gtm/recovery/Makefile index f604d2bb65..e98e0f69fd 100644 --- a/src/gtm/recovery/Makefile +++ b/src/gtm/recovery/Makefile @@ -11,17 +11,21 @@ top_builddir=../../.. include $(top_builddir)/src/Makefile.global subdir=src/gtm/recovery -include $(top_srcdir)/src/backend/common.mk +NAME=gtmrecovery +SO_MAJOR_VERSION= 1 +SO_MINOR_VERSION= 0 -OBJS = register_common.o standby_utils.o +OBJS=register_common.o register_gtm.o replication.o standby_utils.o -all: libgtmrecovery.a +OTHERS=../client/libgtmclient.a -libgtmrecovery.a: $(OBJS) - $(AR) $(AROPT) $@ $^ +all:all-lib + +include $(top_srcdir)/src/Makefile.shlib clean: - rm -f $(OBJS) libgtmrecovery.a + rm -f $(OBJS) + rm -f libgtmrecovery.a libgtmrecovery.so libgtmrecovery.so.1 libgtmrecovery.so.1.0 distclean: clean diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c index f8f98ec8ac..4de2299b47 100644 --- a/src/gtm/recovery/register_common.c +++ b/src/gtm/recovery/register_common.c @@ -3,6 +3,11 @@ * register.c * PGXC Node Register on GTM and GTM Proxy, node registering functions * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -31,6 +36,7 @@ #include "gtm/register.h" #include "gtm/gtm_ip.h" +#include "storage/backendid.h" #define GTM_NODE_FILE "register.node" #define NODE_HASH_TABLE_SIZE 16 @@ -120,7 +126,7 @@ pgxcnode_find_by_type(GTM_PGXCNodeType type, GTM_PGXCNodeInfo **data, size_t max if (cur != NULL && cur->type == type) { data[node] = cur; - elog(LOG, "pgxcnode_find_by_type: cur=%p, ipaddress=%s", cur, cur->ipaddress); + elog(DEBUG1, "pgxcnode_find_by_type: cur=%p, ipaddress=%s", cur, cur->ipaddress); node++; } @@ -349,8 +355,18 @@ Recovery_PGXCNodeUnregister(GTM_PGXCNodeType type, char *node_name, bool in_reco Recovery_RecordRegisterInfo(nodeinfo, false); pfree(nodeinfo->nodename); +#ifdef XCP + if (nodeinfo->ipaddress) +#endif pfree(nodeinfo->ipaddress); +#ifdef XCP + if (nodeinfo->datafolder) +#endif pfree(nodeinfo->datafolder); +#ifdef XCP + if (nodeinfo->sessions) + pfree(nodeinfo->sessions); +#endif pfree(nodeinfo); } else @@ -373,7 +389,11 @@ Recovery_PGXCNodeRegister(GTM_PGXCNodeType type, GTM_PGXCNodeInfo *nodeinfo = NULL; int errcode = 0; +#ifdef XCP + nodeinfo = (GTM_PGXCNodeInfo *) palloc0(sizeof(GTM_PGXCNodeInfo)); +#else nodeinfo = (GTM_PGXCNodeInfo *) palloc(sizeof (GTM_PGXCNodeInfo)); +#endif if (nodeinfo == NULL) ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); @@ -394,10 +414,10 @@ Recovery_PGXCNodeRegister(GTM_PGXCNodeType type, nodeinfo->status = status; nodeinfo->socket = socket; - elog(LOG, "Recovery_PGXCNodeRegister Request info: type=%d, nodename=%s, port=%d," \ + elog(DEBUG1, "Recovery_PGXCNodeRegister Request info: type=%d, nodename=%s, port=%d," \ "datafolder=%s, ipaddress=%s, status=%d", type, nodename, port, datafolder, ipaddress, status); - elog(LOG, "Recovery_PGXCNodeRegister Node info: type=%d, nodename=%s, port=%d, "\ + elog(DEBUG1, 
"Recovery_PGXCNodeRegister Node info: type=%d, nodename=%s, port=%d, "\ "datafolder=%s, ipaddress=%s, status=%d", nodeinfo->type, nodeinfo->nodename, nodeinfo->port, nodeinfo->datafolder, nodeinfo->ipaddress, nodeinfo->status); @@ -633,6 +653,7 @@ Recovery_RecordRegisterInfo(GTM_PGXCNodeInfo *nodeinfo, bool is_register) void Recovery_RestoreRegisterInfo(void) { +#ifndef XCP int magic; int ctlfd; @@ -701,6 +722,7 @@ Recovery_RestoreRegisterInfo(void) } close(ctlfd); +#endif } void @@ -785,6 +807,94 @@ Recovery_PGXCNodeBackendDisconnect(GTM_PGXCNodeType type, char *nodename, int so return errcode; } + +#ifdef XCP +/* + * Register active distributed session. If another session with specified + * BackendId already exists return the PID of the session, so caller could clean + * it up. Otherwise return 0. + */ +int +Recovery_PGXCNodeRegisterCoordProcess(char *coord_node, int coord_procid, + int coord_backendid) +{ + GTM_PGXCNodeInfo *nodeinfo; + GTM_PGXCSession *session; + int i; + + /* + * Get the registration record for the coordinator node. If not specified, + * register it now. + */ + nodeinfo = pgxcnode_find_info(GTM_NODE_COORDINATOR, coord_node); + + if (nodeinfo == NULL) + { + if (Recovery_PGXCNodeRegister(GTM_NODE_COORDINATOR, coord_node, 0, NULL, + NODE_CONNECTED, NULL, NULL, false, 0)) + return 0; + + nodeinfo = pgxcnode_find_info(GTM_NODE_COORDINATOR, coord_node); + } + + /* Iterate over the existing sessions */ + GTM_RWLockAcquire(&nodeinfo->node_lock, GTM_LOCKMODE_WRITE); + for (i = 0; i < nodeinfo->num_sessions; i++) + { + if (nodeinfo->sessions[i].gps_coord_proc_id == coord_procid) + { + /* + * Already registered, nothing todo. + * May be session lost the GTM connection and now is reconnecting. + */ + GTM_RWLockRelease(&nodeinfo->node_lock); + return 0; + } + if (nodeinfo->sessions[i].gps_coord_backend_id == coord_backendid) + { + /* + * Reuse the entry and return PID of the previous session. 
+ */ + int result = nodeinfo->sessions[i].gps_coord_proc_id; + elog(DEBUG1, "New session %s:%d with existing BackendId %d", + coord_node, coord_procid, coord_backendid); + nodeinfo->sessions[i].gps_coord_proc_id = coord_procid; + GTM_RWLockRelease(&nodeinfo->node_lock); + return result; + } + } + /* Session not found, populate new entry */ + elog(DEBUG1, "New session %s:%d with BackendId %d", + coord_node, coord_procid, coord_backendid); + if (nodeinfo->num_sessions == nodeinfo->max_sessions) + { + /* need to extend array */ +#define INIT_SESSIONS 256 + if (nodeinfo->max_sessions == 0) + { + nodeinfo->sessions = (GTM_PGXCSession *) + palloc(INIT_SESSIONS * sizeof(GTM_PGXCSession)); + nodeinfo->max_sessions = INIT_SESSIONS; + } + else + { + int newsize = nodeinfo->max_sessions * 2; + nodeinfo->sessions = (GTM_PGXCSession *) + repalloc(nodeinfo->sessions, + newsize * sizeof(GTM_PGXCSession)); + nodeinfo->max_sessions = newsize; + } + } + nodeinfo->sessions[nodeinfo->num_sessions].gps_coord_proc_id = coord_procid; + nodeinfo->sessions[nodeinfo->num_sessions].gps_coord_backend_id = coord_backendid; + nodeinfo->num_sessions++; + GTM_RWLockRelease(&nodeinfo->node_lock); + + return 0; +} +#endif + + /* * Process MSG_BACKEND_DISCONNECT * @@ -847,7 +957,7 @@ ProcessPGXCNodeBackendDisconnect(Port *myport, StringInfo message) GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; int count = 0; - elog(LOG, "forwarding MSG_BACKEND_DISCONNECT to standby GTM %p.", + elog(DEBUG1, "forwarding MSG_BACKEND_DISCONNECT to standby GTM %p.", GetMyThreadInfo->thr_conn->standby); retry: @@ -859,6 +969,6 @@ retry: if (gtm_standby_check_communication_error(&count, oldconn)) goto retry; - elog(LOG, "MSG_BACKEND_DISCONNECT rc=%d done.", _rc); + elog(DEBUG1, "MSG_BACKEND_DISCONNECT rc=%d done.", _rc); } } diff --git a/src/gtm/recovery/register_gtm.c b/src/gtm/recovery/register_gtm.c new file mode 100644 index 0000000000..bb7f433404 --- /dev/null +++ b/src/gtm/recovery/register_gtm.c @@ -0,0 
+1,597 @@ +/*------------------------------------------------------------------------- + * + * register.c + * PGXC Node Register on GTM and GTM Proxy, node registering functions + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "gtm/elog.h" +#include "gtm/gtm.h" +#include "gtm/gtm_client.h" +#include "gtm/gtm_serialize.h" +#include "gtm/gtm_standby.h" +#include "gtm/libpq.h" +#include "gtm/libpq-int.h" +#include "gtm/pqformat.h" +#include "gtm/stringinfo.h" +#include "gtm/register.h" + +#include "gtm/gtm_ip.h" + +#ifdef XCP +#include "storage/backendid.h" +#endif + +static void finishStandbyConn(GTM_ThreadInfo *thrinfo); +extern bool Backup_synchronously; + +/* + * Process MSG_NODE_REGISTER/MSG_BKUP_NODE_REGISTER message. + * + * is_backup indicates the message is MSG_BKUP_NODE_REGISTER. 
+ */ +void +ProcessPGXCNodeRegister(Port *myport, StringInfo message, bool is_backup) +{ + GTM_PGXCNodeType type; + GTM_PGXCNodePort port; + char remote_host[NI_MAXHOST]; + char datafolder[NI_MAXHOST]; + char node_name[NI_MAXHOST]; + char proxyname[NI_MAXHOST]; + char *ipaddress; + MemoryContext oldContext; + int len; + StringInfoData buf; + GTM_PGXCNodeStatus status; + + /* Read Node Type */ + memcpy(&type, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeType)), + sizeof (GTM_PGXCNodeType)); + + /* Read Node name */ + len = pq_getmsgint(message, sizeof (int)); + if (len >= NI_MAXHOST) + ereport(ERROR, + (EINVAL, + errmsg("Invalid name length."))); + + memcpy(node_name, (char *)pq_getmsgbytes(message, len), len); + node_name[len] = '\0'; + + /* Read Host name */ + len = pq_getmsgint(message, sizeof (int)); + memcpy(remote_host, (char *)pq_getmsgbytes(message, len), len); + remote_host[len] = '\0'; + ipaddress = remote_host; + + /* Read Port Number */ + memcpy(&port, pq_getmsgbytes(message, sizeof (GTM_PGXCNodePort)), + sizeof (GTM_PGXCNodePort)); + + /* Read Proxy name (empty string if no proxy used) */ + len = pq_getmsgint(message, sizeof (GTM_StrLen)); + if (len >= NI_MAXHOST) + ereport(ERROR, + (EINVAL, + errmsg("Invalid proxy name length."))); + memcpy(proxyname, (char *)pq_getmsgbytes(message, len), len); + proxyname[len] = '\0'; + + /* + * Finish by reading Data Folder (length and then string) + */ + len = pq_getmsgint(message, sizeof (GTM_StrLen)); + + memcpy(datafolder, (char *)pq_getmsgbytes(message, len), len); + datafolder[len] = '\0'; + + elog(DEBUG1, + "ProcessPGXCNodeRegister: ipaddress = \"%s\", node name = \"%s\", proxy name = \"%s\", " + "datafolder \"%s\"", + ipaddress, node_name, proxyname, datafolder); + + status = pq_getmsgint(message, sizeof (GTM_PGXCNodeStatus)); + + if ((type!=GTM_NODE_GTM_PROXY) && + (type!=GTM_NODE_GTM_PROXY_POSTMASTER) && + (type!=GTM_NODE_COORDINATOR) && + (type!=GTM_NODE_DATANODE) && + (type!=GTM_NODE_GTM) && + 
(type!=GTM_NODE_DEFAULT)) + ereport(ERROR, + (EINVAL, + errmsg("Unknown node type."))); + + elog(DEBUG1, "Node type = %d", type); + + /* + * We must use the TopMostMemoryContext because the Node ID information is + * not bound to a thread and can outlive any of the thread specific + * contextes. + */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + /* + * We don't check if the this is not in standby mode to allow + * cascaded standby. + */ + if (type == GTM_NODE_GTM) + { + elog(DEBUG1, "Registering GTM (Standby). Unregister this first."); + /* + * There's another standby. May be failed one. + * Clean this up. This means that we allow + * only one standby at the same time. + * + * This helps to give up failed standby and connect + * new one, regardless how they stopped. + * + * Be sure that all ther threads are locked by other + * means, typically by receiving MSG_BEGIN_BACKUP. + * + * First try to unregister GTM which is now connected. We don't care + * if it failed. + */ + Recovery_PGXCNodeUnregister(type, node_name, false, -1); + /* + * Then disconnect the connections to the standby from each thread. + * Please note that we assume only one standby is allowed at the same time. + * Cascade standby may be allowed. + */ + GTM_DoForAllOtherThreads(finishStandbyConn); + + GTMThreads->gt_standby_ready = true; + } + + if (Recovery_PGXCNodeRegister(type, node_name, port, + proxyname, NODE_CONNECTED, + ipaddress, datafolder, false, myport->sock)) + { + ereport(ERROR, + (EINVAL, + errmsg("Failed to Register node"))); + } + + /* + * We don't check if the this is not in standby mode to allow + * cascaded standby. 
+ */ + if (type == GTM_NODE_GTM) + GTMThreads->gt_standby_ready = true; + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + if (!is_backup) + { + /* + * Backup first + */ + if (GetMyThreadInfo->thr_conn->standby) + { + int _rc; + GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; + int count = 0; + GTM_PGXCNodeInfo *standbynode; + + elog(DEBUG1, "calling node_register_internal() for standby GTM %p.", + GetMyThreadInfo->thr_conn->standby); + + retry: + _rc = bkup_node_register_internal(GetMyThreadInfo->thr_conn->standby, + type, + ipaddress, + port, + node_name, + datafolder, + status); + + elog(DEBUG1, "node_register_internal() returns rc %d.", _rc); + + if (gtm_standby_check_communication_error(&count, oldconn)) + goto retry; + + /* Now check if there're other standby registered. */ + standbynode = find_standby_node_info(); + if (!standbynode) + GTMThreads->gt_standby_ready = false; + + if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) + gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); + + } + /* + * Then, send a SUCCESS message back to the client + */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, NODE_REGISTER_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&type, sizeof(GTM_PGXCNodeType)); + /* Node name length */ + pq_sendint(&buf, strlen(node_name), 4); + /* Node name (var-len) */ + pq_sendbytes(&buf, node_name, strlen(node_name)); + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + if (GetMyThreadInfo->thr_conn->standby) + gtmpqFlush(GetMyThreadInfo->thr_conn->standby); + pq_flush(myport); + } + } +} + + +/* + * Process MSG_NODE_UNREGISTER/MSG_BKUP_NODE_UNREGISTER + * + * is_backup indiccates MSG_BKUP_NODE_UNREGISTER + */ +void +ProcessPGXCNodeUnregister(Port *myport, StringInfo 
message, bool is_backup) +{ + GTM_PGXCNodeType type; + MemoryContext oldContext; + StringInfoData buf; + int len; + char node_name[NI_MAXHOST]; + + /* Read Node Type and number */ + memcpy(&type, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeType)), + sizeof (GTM_PGXCNodeType)); + + /* Read Node name */ + len = pq_getmsgint(message, sizeof (int)); + if (len >= NI_MAXHOST) + ereport(ERROR, + (EINVAL, + errmsg("Invalid node name length"))); + memcpy(node_name, (char *)pq_getmsgbytes(message, len), len); + node_name[len] = '\0'; + + /* + * We must use the TopMostMemoryContext because the Node ID information is + * not bound to a thread and can outlive any of the thread specific + * contextes. + */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + if (Recovery_PGXCNodeUnregister(type, node_name, false, myport->sock)) + { + ereport(ERROR, + (EINVAL, + errmsg("Failed to Unregister node"))); + } + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + + if (!is_backup) + { + /* + * Backup first + */ + if (GetMyThreadInfo->thr_conn->standby) + { + int _rc; + GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; + int count = 0; + + elog(DEBUG1, "calling node_unregister() for standby GTM %p.", + GetMyThreadInfo->thr_conn->standby); + + retry: + _rc = bkup_node_unregister(GetMyThreadInfo->thr_conn->standby, + type, + node_name); + + + if (gtm_standby_check_communication_error(&count, oldconn)) + goto retry; + + if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) + gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); + + elog(DEBUG1, "node_unregister() returns rc %d.", _rc); + } + /* + * Send a SUCCESS message back to the client + */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, NODE_UNREGISTER_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, 
(char *)&type, sizeof(GTM_PGXCNodeType)); + /* Node name length */ + pq_sendint(&buf, strlen(node_name), 4); + /* Node name (var-len) */ + pq_sendbytes(&buf, node_name, strlen(node_name)); + + pq_endmessage(myport, &buf); + + /* Flush standby before flush to the client */ + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + if (GetMyThreadInfo->thr_conn->standby) + gtmpqFlush(GetMyThreadInfo->thr_conn->standby); + pq_flush(myport); + } + } +} + +/* + * Process MSG_NODE_LIST + */ +void +ProcessPGXCNodeList(Port *myport, StringInfo message) +{ + MemoryContext oldContext; + StringInfoData buf; + int num_node = 13; + int i; + + GTM_PGXCNodeInfo *data[MAX_NODES]; + char *s_data[MAX_NODES]; + size_t s_datalen[MAX_NODES]; + + /* + * We must use the TopMostMemoryContext because the Node ID information is + * not bound to a thread and can outlive any of the thread specific + * contextes. + */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + memset(data, 0, sizeof(GTM_PGXCNodeInfo *) * MAX_NODES); + memset(s_data, 0, sizeof(char *) * MAX_NODES); + + num_node = pgxcnode_get_all(data, MAX_NODES); + + for (i = 0; i < num_node; i++) + { + size_t s_len; + + s_len = gtm_get_pgxcnodeinfo_size(data[i]); + + /* + * Allocate memory blocks for serialized GTM_PGXCNodeInfo data. 
+ */ + s_data[i] = (char *)malloc(s_len+1); + memset(s_data[i], 0, s_len+1); + + s_datalen[i] = gtm_serialize_pgxcnodeinfo(data[i], s_data[i], s_len+1); + + elog(DEBUG1, "gtm_get_pgxcnodeinfo_size: s_len=%ld, s_datalen=%ld", s_len, s_datalen[i]); + } + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + /* + * Send a SUCCESS message back to the client + */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, NODE_LIST_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, num_node, sizeof(int)); /* number of nodes */ + + /* + * Send pairs of GTM_PGXCNodeInfo size and serialized GTM_PGXCNodeInfo body. + */ + for (i = 0; i < num_node; i++) + { + pq_sendint(&buf, s_datalen[i], sizeof(int)); + pq_sendbytes(&buf, s_data[i], s_datalen[i]); + } + + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + pq_flush(myport); + + /* + * Release memory blocks for the serialized data. 
+ */ + for (i = 0; i < num_node; i++) + { + free(s_data[i]); + } + + elog(DEBUG1, "ProcessPGXCNodeList() ok."); +} + +void +ProcessGTMBeginBackup(Port *myport, StringInfo message) +{ + int ii; + GTM_ThreadInfo *my_threadinfo; + StringInfoData buf; + + pq_getmsgend(message); + my_threadinfo = GetMyThreadInfo; + + for (ii = 0; ii < GTMThreads->gt_array_size; ii++) + { + if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo) + GTM_RWLockAcquire(>MThreads->gt_threads[ii]->thr_lock, GTM_LOCKMODE_WRITE); + } + my_threadinfo->thr_status = GTM_THREAD_BACKUP; + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, BEGIN_BACKUP_RESULT, 4); + pq_endmessage(myport, &buf); + pq_flush(myport); +} + +void +ProcessGTMEndBackup(Port *myport, StringInfo message) +{ + int ii; + GTM_ThreadInfo *my_threadinfo; + StringInfoData buf; + + pq_getmsgend(message); + my_threadinfo = GetMyThreadInfo; + + for (ii = 0; ii < GTMThreads->gt_array_size; ii++) + { + if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo) + GTM_RWLockRelease(>MThreads->gt_threads[ii]->thr_lock); + } + my_threadinfo->thr_status = GTM_THREAD_RUNNING; + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, END_BACKUP_RESULT, 4); + pq_endmessage(myport, &buf); + pq_flush(myport); +} + + +static void +finishStandbyConn(GTM_ThreadInfo *thrinfo) +{ + if ((thrinfo->thr_conn != NULL) && (thrinfo->thr_conn->standby != NULL)) + { + GTMPQfinish(thrinfo->thr_conn->standby); + thrinfo->thr_conn->standby = NULL; + } +} + + +#ifdef XCP +/* + * Process MSG_REGISTER_SESSION message + */ +void +ProcessPGXCRegisterSession(Port *myport, StringInfo message) +{ + char coord_name[SP_NODE_NAME]; + int32 coord_procid; + int32 coord_backendid; + int32 len; + MemoryContext oldContext; + int old_procid; + StringInfoData buf; + + len = pq_getmsgint(message, sizeof(len)); + if (len >= SP_NODE_NAME) + ereport(ERROR, + (EINVAL, + errmsg("Invalid name length."))); + + memcpy(coord_name, (char *)pq_getmsgbytes(message, 
len), len); + coord_name[len] = '\0'; + + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); + + coord_backendid = pq_getmsgint(message, sizeof(coord_backendid)); + + /* + * Check if all required data are supplied + */ + if (len > 0 || coord_procid > 0 || coord_backendid != InvalidBackendId) + { + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + /* + * Register the session + */ + old_procid = Recovery_PGXCNodeRegisterCoordProcess(coord_name, coord_procid, + coord_backendid); + MemoryContextSwitchTo(oldContext); + + /* + * If there was a session with same backend id clean it up. + */ + if (old_procid) + GTM_CleanupSeqSession(coord_name, old_procid); + } + + /* + * If there is a standby forward the info to it + */ + if (GetMyThreadInfo->thr_conn->standby) + { + int _rc; + GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby; + int count = 0; + GTM_PGXCNodeInfo *standbynode; + + elog(DEBUG1, "calling register_session() for standby GTM %p.", + GetMyThreadInfo->thr_conn->standby); + + do + { + _rc = register_session(GetMyThreadInfo->thr_conn->standby, + coord_name, coord_procid, coord_backendid); + + elog(DEBUG1, "register_session() returns rc %d.", _rc); + } + while (gtm_standby_check_communication_error(&count, oldconn)); + + /* Now check if there're other standby registered. 
*/ + standbynode = find_standby_node_info(); + if (!standbynode) + GTMThreads->gt_standby_ready = false; + + if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY)) + gtm_sync_standby(GetMyThreadInfo->thr_conn->standby); + + } + + /* Make up response */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, REGISTER_SESSION_RESULT, 4); + /* For proxy write out header */ + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_endmessage(myport, &buf); + /* Flush connections */ + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + if (GetMyThreadInfo->thr_conn->standby) + gtmpqFlush(GetMyThreadInfo->thr_conn->standby); + pq_flush(myport); + } +} +#endif diff --git a/src/gtm/recovery/replication.c b/src/gtm/recovery/replication.c new file mode 100644 index 0000000000..bc04b191db --- /dev/null +++ b/src/gtm/recovery/replication.c @@ -0,0 +1,129 @@ +/*------------------------------------------------------------------------- + * + * replication.c + * Controlling the initialization and end of replication process of GTM data + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/gtm/recovery/replication.c + * + *------------------------------------------------------------------------- + */ +#include "gtm/replication.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/gtm_txn.h" +#include "gtm/standby_utils.h" +#include "gtm/gtm_standby.h" +#include "gtm/register.h" +#include "gtm/assert.h" +#include <stdio.h> +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include "gtm/gtm_msg.h" +#include "gtm/gtm_ip.h" + +/* + * Process 
MSG_NODE_BEGIN_REPLICATION_INIT + */ +void +ProcessBeginReplicationInitialSyncRequest(Port *myport, StringInfo message) +{ + StringInfoData buf; + MemoryContext oldContext; + + pq_getmsgend(message); + + if (Recovery_IsStandby()) + ereport(ERROR, + (EPERM, + errmsg("Operation not permitted under the standby mode."))); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* Acquire global locks to copy resource data to the standby. */ + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + elog(DEBUG1, "Prepared for copying data with holding XidGenLock and TransArrayLock."); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, NODE_BEGIN_REPLICATION_INIT_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_endmessage(myport, &buf); + + /* + * Beause this command comes from the standby, we don't have to flush + * messages to the standby here. + */ + if (myport->remote_type != GTM_NODE_GTM_PROXY) + pq_flush(myport); + + elog(DEBUG1, "ProcessBeginReplicationInitialSyncRequest() ok."); + + return; +} + +/* + * Process MSG_NODE_END_REPLICATION_INIT + */ +void +ProcessEndReplicationInitialSyncRequest(Port *myport, StringInfo message) +{ + StringInfoData buf; + MemoryContext oldContext; + + pq_getmsgend(message); + + if (Recovery_IsStandby()) + ereport(ERROR, + (EPERM, + errmsg("Operation not permitted under the standby mode."))); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* + * Release global locks after copying resource data to the standby. 
+ */ + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + elog(DEBUG1, "XidGenLock and TransArrayLock released."); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, NODE_END_REPLICATION_INIT_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_endmessage(myport, &buf); + + /* + * Beause this command comes from the standby, we don't have to flush + * messages to the standby here. + */ + if (myport->remote_type != GTM_NODE_GTM_PROXY) + pq_flush(myport); + + elog(DEBUG1, "ProcessEndReplicationInitialSyncRequest() ok."); + + return; +} diff --git a/src/include/Makefile b/src/include/Makefile index 5f5e6819d6..74de25eb3e 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -22,8 +22,7 @@ SUBDIRS = access bootstrap catalog commands datatype executor foreign lib libpq tcop snowball snowball/libstemmer tsearch tsearch/dicts utils \ port port/win32 port/win32_msvc port/win32_msvc/sys \ port/win32/arpa port/win32/netinet port/win32/sys \ - portability \ - gtm + portability gtm # Install all headers install: all installdirs diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 5c5692b2c5..5c70872a1b 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * gtm.h - * + * * Module interfacing with GTM definitions * * @@ -16,6 +16,9 @@ extern char *GtmHost; extern int GtmPort; +#ifdef XCP +extern bool IsXidFromGTM; +#endif extern GlobalTransactionId currentGxid; extern bool IsGTMConnected(void); @@ -43,9 +46,15 @@ extern int RegisterGTM(GTM_PGXCNodeType type, GTM_PGXCNodePort port, char *dataf extern int UnregisterGTM(GTM_PGXCNodeType type); /* Sequence interface APIs with GTM */ +extern 
GTM_Sequence GetCurrentValGTM(char *seqname); +#ifdef XCP +extern GTM_Sequence GetNextValGTM(char *seqname, + GTM_Sequence range, GTM_Sequence *rangemax); +#else extern GTM_Sequence GetNextValGTM(char *seqname); +#endif extern int SetValGTM(char *seqname, GTM_Sequence nextval, bool iscalled); -extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment, +extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval, bool cycle); extern int AlterSequenceGTM(char *seqname, GTM_Sequence increment, diff --git a/src/include/access/htup.h b/src/include/access/htup.h index c93a7fb36e..fbb802e4c9 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -4,6 +4,11 @@ * POSTGRES heap tuple definitions. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -534,6 +539,22 @@ typedef HeapTupleData *HeapTuple; */ #define GETSTRUCT(TUP) ((char *) ((TUP)->t_data) + (TUP)->t_data->t_hoff) +#ifdef XCP +/* + * Represents a DataRow message received from a remote node. + * Contains originating node number and message body in DataRow format without + * message code and length. Length and node number are separate fields. + * This is a variable length structure. + */ +typedef struct RemoteDataRowData +{ + Oid msgnode; /* node number of the data row message */ + int msglen; /* length of the data row message */ + char msg[0]; /* last data row message */ +} RemoteDataRowData; +typedef RemoteDataRowData *RemoteDataRow; +#endif + /* * Accessor macros to be used with HeapTuple pointers. 
*/ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index e9e5edda25..b9d46e3504 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -4,6 +4,11 @@ * postgres transaction access method support code * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -173,6 +178,10 @@ extern TransactionId GetNewTransactionId(bool isSubXact, bool *timestamp_receive #else extern TransactionId GetNewTransactionId(bool isSubXact); #endif /* PGXC */ +#ifdef XCP +extern bool TransactionIdIsCurrentGlobalTransactionId(TransactionId xid); +extern TransactionId GetNextTransactionId(void); +#endif extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 82999726a0..127a849c10 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -4,6 +4,11 @@ * postgres transaction system definitions * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -228,8 +233,10 @@ extern TransactionId GetTopTransactionIdIfAny(void); extern TransactionId GetCurrentTransactionId(void); extern TransactionId GetCurrentTransactionIdIfAny(void); #ifdef PGXC /* PGXC_COORD */ +#ifndef XCP extern bool GetCurrentLocalParamStatus(void); extern void SetCurrentLocalParamStatus(bool status); +#endif extern GlobalTransactionId GetAuxilliaryTransactionId(void); extern GlobalTransactionId GetTopGlobalTransactionId(void); extern void SetAuxilliaryTransactionId(GlobalTransactionId gxid); diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index 47224d3dd2..7f0ed6db2b 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -28,8 +28,7 @@ typedef enum WalReceiverProcess, #ifdef PGXC PoolerProcess, -#endif - +#endif NUM_AUXPROCTYPES /* Must be last! */ } AuxProcType; diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 678a945271..5a1861da6e 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -4,6 +4,11 @@ * prototypes for functions in backend/catalog/catalog.c * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -36,8 +41,13 @@ extern char *relpathbackend(RelFileNode rnode, BackendId backend, extern char *GetDatabasePath(Oid dbNode, Oid spcNode); /* First argument is a RelFileNodeBackend */ +#ifdef XCP +#define relpath(rnode, forknum) \ + relpathbackend((rnode).node, InvalidBackendId, (forknum)) +#else #define relpath(rnode, forknum) \ relpathbackend((rnode).node, (rnode).backend, (forknum)) +#endif /* First argument is a RelFileNode */ #define relpathperm(rnode, forknum) \ diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h index 76215dc8a1..0c2d245e90 100644 --- a/src/include/catalog/namespace.h +++ b/src/include/catalog/namespace.h @@ -4,6 +4,11 @@ * prototypes for functions in backend/catalog/namespace.c * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -125,6 +130,9 @@ extern bool isOtherTempNamespace(Oid namespaceId); extern int GetTempNamespaceBackendId(Oid namespaceId); extern Oid GetTempToastNamespace(void); extern void ResetTempTableNamespace(void); +#ifdef XCP +extern void ForgetTempTableNamespace(void); +#endif extern OverrideSearchPath *GetOverrideSearchPath(MemoryContext context); extern OverrideSearchPath *CopyOverrideSearchPath(OverrideSearchPath *path); diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h index ce8fc1b156..7bf70e4ff8 100644 --- a/src/include/catalog/pg_aggregate.h +++ b/src/include/catalog/pg_aggregate.h @@ -5,6 +5,11 @@ * along with the relation's initial contents. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -40,6 +45,9 @@ * aggfinalfn final function (0 if none) * aggsortop associated sort operator (0 if none) * aggtranstype type of aggregate's transition (state) data +#ifdef PGXC + * aggcollecttype type of aggregate's collection (state) data +#endif * agginitval initial value for transition state (can be NULL) #ifdef PGXC * agginitcollect initial value for collection state (can be NULL) @@ -56,6 +64,7 @@ CATALOG(pg_aggregate,2600) BKI_WITHOUT_OIDS regproc aggfinalfn; Oid aggsortop; Oid aggtranstype; + Oid aggcollecttype; /* PGXC */ #ifdef CATALOG_VARLEN /* variable-length fields start here */ text agginitval; @@ -76,15 +85,16 @@ typedef FormData_pg_aggregate *Form_pg_aggregate; */ #ifdef PGXC -#define Natts_pg_aggregate 8 +#define Natts_pg_aggregate 9 #define Anum_pg_aggregate_aggfnoid 1 #define Anum_pg_aggregate_aggtransfn 2 #define Anum_pg_aggregate_aggcollectfn 3 #define Anum_pg_aggregate_aggfinalfn 4 #define Anum_pg_aggregate_aggsortop 5 #define Anum_pg_aggregate_aggtranstype 6 -#define Anum_pg_aggregate_agginitval 7 -#define Anum_pg_aggregate_agginitcollect 8 +#define Anum_pg_aggregate_aggcollecttype 7 +#define Anum_pg_aggregate_agginitval 8 +#define Anum_pg_aggregate_agginitcollect 9 #endif #ifdef PGXC //#define Natts_pg_aggregate 6 @@ -104,13 +114,13 @@ typedef FormData_pg_aggregate *Form_pg_aggregate; /* avg */ #ifdef PGXC -DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" )); -DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" )); -DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" )); -DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" )); -DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2105 float8_accum 
float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 "{0 second,0 second}" "{0 second,0 second}" )); +DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" )); +DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" )); +DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" )); +DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" )); +DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2105 float8_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 1187 "{0 second,0 second}" "{0 second,0 second}" )); #endif #ifdef PGXC //DATA(insert ( 2100 int8_avg_accum numeric_avg 0 1231 "{0,0}" )); @@ -124,14 +134,14 @@ DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 "{0 secon /* sum */ #ifdef PGXC -DATA(insert ( 2107 int8_sum numeric_add - 0 1700 _null_ "0" )); -DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 _null_ _null_ )); -DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 _null_ _null_ )); -DATA(insert ( 2110 float4pl float4pl - 0 700 _null_ "0" )); -DATA(insert ( 2111 float8pl float8pl - 0 701 _null_ "0" )); -DATA(insert ( 2112 cash_pl cash_pl - 0 790 _null_ _null_ )); -DATA(insert ( 2113 interval_pl interval_pl - 0 1186 _null_ _null_ )); -DATA(insert ( 2114 numeric_add numeric_add - 0 1700 _null_ "0" )); +DATA(insert ( 2107 int8_sum numeric_add - 0 1700 1700 _null_ _null_ )); +DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ )); +DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ )); +DATA(insert ( 2110 float4pl float4pl - 0 700 700 _null_ _null_ )); +DATA(insert ( 2111 float8pl float8pl - 0 701 701 _null_ _null_ )); +DATA(insert ( 2112 
cash_pl cash_pl - 0 790 790 _null_ _null_ )); +DATA(insert ( 2113 interval_pl interval_pl - 0 1186 1186 _null_ _null_ )); +DATA(insert ( 2114 numeric_add numeric_add - 0 1700 1700 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2107 int8_sum - 0 1700 _null_ )); @@ -146,26 +156,26 @@ DATA(insert ( 2114 numeric_add numeric_add - 0 1700 _null_ "0" )); /* max */ #ifdef PGXC -DATA(insert ( 2115 int8larger int8larger - 413 20 _null_ _null_ )); -DATA(insert ( 2116 int4larger int4larger - 521 23 _null_ _null_ )); -DATA(insert ( 2117 int2larger int2larger - 520 21 _null_ _null_ )); -DATA(insert ( 2118 oidlarger oidlarger - 610 26 _null_ _null_ )); -DATA(insert ( 2119 float4larger float4larger - 623 700 _null_ _null_ )); -DATA(insert ( 2120 float8larger float8larger - 674 701 _null_ _null_ )); -DATA(insert ( 2121 int4larger int4larger - 563 702 _null_ _null_ )); -DATA(insert ( 2122 date_larger date_larger - 1097 1082 _null_ _null_ )); -DATA(insert ( 2123 time_larger time_larger - 1112 1083 _null_ _null_ )); -DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 _null_ _null_ )); -DATA(insert ( 2125 cashlarger cashlarger - 903 790 _null_ _null_ )); -DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 _null_ _null_ )); -DATA(insert ( 2127 timestamptz_larger timestamptz_larger - 1324 1184 _null_ _null_ )); -DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 _null_ _null_ )); -DATA(insert ( 2129 text_larger text_larger - 666 25 _null_ _null_ )); -DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 _null_ _null_ )); -DATA(insert ( 2050 array_larger array_larger - 1073 2277 _null_ _null_ )); -DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 _null_ _null_ )); -DATA(insert ( 2797 tidlarger tidlarger - 2800 27 _null_ _null_ )); -DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 _null_ _null_ )); +DATA(insert ( 2115 int8larger int8larger - 413 20 20 _null_ _null_ )); +DATA(insert ( 2116 int4larger int4larger - 521 23 23 _null_ 
_null_ )); +DATA(insert ( 2117 int2larger int2larger - 520 21 21 _null_ _null_ )); +DATA(insert ( 2118 oidlarger oidlarger - 610 26 26 _null_ _null_ )); +DATA(insert ( 2119 float4larger float4larger - 623 700 700 _null_ _null_ )); +DATA(insert ( 2120 float8larger float8larger - 674 701 701 _null_ _null_ )); +DATA(insert ( 2121 int4larger int4larger - 563 702 702 _null_ _null_ )); +DATA(insert ( 2122 date_larger date_larger - 1097 1082 1082 _null_ _null_ )); +DATA(insert ( 2123 time_larger time_larger - 1112 1083 1083 _null_ _null_ )); +DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 1266 _null_ _null_ )); +DATA(insert ( 2125 cashlarger cashlarger - 903 790 790 _null_ _null_ )); +DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 1114 _null_ _null_ )); +DATA(insert ( 2127 timestamptz_larger timestamptz_larger - 1324 1184 1184 _null_ _null_ )); +DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 1186 _null_ _null_ )); +DATA(insert ( 2129 text_larger text_larger - 666 25 25 _null_ _null_ )); +DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 1700 _null_ _null_ )); +DATA(insert ( 2050 array_larger array_larger - 1073 2277 2277 _null_ _null_ )); +DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 1042 _null_ _null_ )); +DATA(insert ( 2797 tidlarger tidlarger - 2800 27 27 _null_ _null_ )); +DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 3500 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2115 int8larger - 413 20 _null_ )); @@ -192,26 +202,26 @@ DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 _null_ _null_ )); /* min */ #ifdef PGXC -DATA(insert ( 2131 int8smaller int8smaller - 412 20 _null_ _null_ )); -DATA(insert ( 2132 int4smaller int4smaller - 97 23 _null_ _null_ )); -DATA(insert ( 2133 int2smaller int2smaller - 95 21 _null_ _null_ )); -DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 _null_ _null_ )); -DATA(insert ( 2135 float4smaller float4smaller - 622 700 _null_ _null_ )); -DATA(insert ( 
2136 float8smaller float8smaller - 672 701 _null_ _null_ )); -DATA(insert ( 2137 int4smaller int4smaller - 562 702 _null_ _null_ )); -DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 _null_ _null_ )); -DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 _null_ _null_ )); -DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 _null_ _null_ )); -DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 _null_ _null_ )); -DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 _null_ _null_ )); -DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 _null_ _null_ )); -DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 _null_ _null_ )); -DATA(insert ( 2145 text_smaller text_smaller - 664 25 _null_ _null_ )); -DATA(insert ( 2146 numeric_smaller numeric_smaller - 1754 1700 _null_ _null_ )); -DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 _null_ _null_ )); -DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 _null_ _null_ )); -DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 _null_ _null_ )); -DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 _null_ _null_ )); +DATA(insert ( 2131 int8smaller int8smaller - 412 20 20 _null_ _null_ )); +DATA(insert ( 2132 int4smaller int4smaller - 97 23 23 _null_ _null_ )); +DATA(insert ( 2133 int2smaller int2smaller - 95 21 21 _null_ _null_ )); +DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 26 _null_ _null_ )); +DATA(insert ( 2135 float4smaller float4smaller - 622 700 700 _null_ _null_ )); +DATA(insert ( 2136 float8smaller float8smaller - 672 701 701 _null_ _null_ )); +DATA(insert ( 2137 int4smaller int4smaller - 562 702 702 _null_ _null_ )); +DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 1082 _null_ _null_ )); +DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 1083 _null_ _null_ )); +DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 1266 _null_ _null_ )); +DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 790 _null_ 
_null_ )); +DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 1114 _null_ _null_ )); +DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 1184 _null_ _null_ )); +DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 1186 _null_ _null_ )); +DATA(insert ( 2145 text_smaller text_smaller - 664 25 25 _null_ _null_ )); +DATA(insert ( 2146 numeric_smaller numeric_smaller - 1754 1700 1700 _null_ _null_ )); +DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 2277 _null_ _null_ )); +DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 1042 _null_ _null_ )); +DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 27 _null_ _null_ )); +DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 3500 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2131 int8smaller - 412 20 _null_ )); @@ -239,8 +249,8 @@ DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 _null_ _null_ )); /* count */ /* Final function is data type conversion function numeric_int8 is referenced by OID because of ambiguous definition in pg_proc */ #ifdef PGXC -DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 "0" "0" )); -DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 "0" "0" )); +DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 20 "0" _null_ )); +DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 20 "0" _null_ )); #endif #ifdef PGXC //DATA(insert ( 2147 int8inc_any - 0 20 "0" )); @@ -249,12 +259,12 @@ DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 "0" "0" )); /* var_pop */ #ifdef PGXC -DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 
"{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2718 int8_accum numeric_var_pop 0 1231 "{0,0,0}" )); @@ -267,12 +277,12 @@ DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0 /* var_samp */ #ifdef PGXC -DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" 
"{0,0,0}" )); +DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2641 int8_accum numeric_var_samp 0 1231 "{0,0,0}" )); @@ -285,12 +295,12 @@ DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0 /* variance: historical Postgres syntax for var_samp */ #ifdef PGXC -DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2148 int8_accum numeric_var_samp 0 1231 "{0,0,0}" )); @@ -303,12 +313,12 @@ DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0 /* stddev_pop */ #ifdef PGXC -DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2726 int2_accum 
numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2726 int2_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2724 int8_accum numeric_stddev_pop 0 1231 "{0,0,0}" )); @@ -321,12 +331,12 @@ DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0 /* stddev_samp */ #ifdef PGXC -DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); 
+DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2712 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" )); @@ -339,12 +349,12 @@ DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0, /* stddev: historical Postgres syntax for stddev_samp */ #ifdef PGXC -DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2154 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" )); @@ -357,18 
+367,18 @@ DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0, /* SQL2003 binary regression aggregates */ #ifdef PGXC -DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 "0" _null_ )); -DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2822 float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 20 "0" _null_ )); +DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2822 
float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2818 int8inc_float8_float8 - 0 20 "0" )); @@ -387,9 +397,9 @@ DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 " /* boolean-and and boolean-or */ #ifdef PGXC -DATA(insert ( 2517 booland_statefunc booland_statefunc - 58 16 _null_ _null_ )); -DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 59 16 _null_ _null_ )); -DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 _null_ _null_ )); +DATA(insert ( 2517 booland_statefunc booland_statefunc - 58 16 16 _null_ _null_ )); +DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 59 16 16 _null_ _null_ )); +DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 16 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2517 booland_statefunc - 58 16 _null_ )); @@ -399,14 +409,14 @@ DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 _null_ _null_ ) /* bitwise integer */ #ifdef PGXC -DATA(insert ( 2236 int2and int2and - 0 21 _null_ _null_ )); -DATA(insert ( 2237 int2or int2or - 0 21 
_null_ _null_ )); -DATA(insert ( 2238 int4and int4and - 0 23 _null_ _null_ )); -DATA(insert ( 2239 int4or int4or - 0 23 _null_ _null_ )); -DATA(insert ( 2240 int8and int8and - 0 20 _null_ _null_ )); -DATA(insert ( 2241 int8or int8or - 0 20 _null_ _null_ )); -DATA(insert ( 2242 bitand bitand - 0 1560 _null_ _null_ )); -DATA(insert ( 2243 bitor bitor - 0 1560 _null_ _null_ )); +DATA(insert ( 2236 int2and int2and - 0 21 21 _null_ _null_ )); +DATA(insert ( 2237 int2or int2or - 0 21 21 _null_ _null_ )); +DATA(insert ( 2238 int4and int4and - 0 23 23 _null_ _null_ )); +DATA(insert ( 2239 int4or int4or - 0 23 23 _null_ _null_ )); +DATA(insert ( 2240 int8and int8and - 0 20 20 _null_ _null_ )); +DATA(insert ( 2241 int8or int8or - 0 20 20 _null_ _null_ )); +DATA(insert ( 2242 bitand bitand - 0 1560 1560 _null_ _null_ )); +DATA(insert ( 2243 bitor bitor - 0 1560 1560 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2236 int2and - 0 21 _null_ )); @@ -421,7 +431,7 @@ DATA(insert ( 2243 bitor bitor - 0 1560 _null_ _null_ )); /* xml */ #ifdef PGXC -DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 _null_ _null_ )); +DATA(insert ( 2901 xmlconcat2 - - 0 142 0 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2901 xmlconcat2 - 0 142 _null_ )); @@ -429,7 +439,7 @@ DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 _null_ _null_ )); /* array */ #ifdef PGXC -DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 _null_ _null_ )); +DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 0 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2335 array_agg_transfn array_agg_finalfn 0 2281 _null_ )); @@ -437,15 +447,18 @@ DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 _null_ _null_ ) /* text */ #ifdef PGXC -DATA(insert ( 3538 string_agg_transfn - string_agg_finalfn 0 2281 _null_ _null_ )); +DATA(insert (3538 string_agg_transfn - string_agg_finalfn 0 2281 0 _null_ _null_ )); +// XXX function string_agg_delim_transfn is not defined? 
+//DATA(insert (3538 string_agg_delim_transfn - string_agg_finalfn 0 2281 0 _null_ _null_ )); #endif #ifdef PGXC -//DATA(insert ( 3538 string_agg_transfn string_agg_finalfn 0 2281 _null_ )); +//DATA(insert (3535 string_agg_transfn string_agg_finalfn 0 2281 _null_ )); +//DATA(insert (3538 string_agg_delim_transfn string_agg_finalfn 0 2281 _null_ )); #endif /* bytea */ #ifdef PGXC -DATA(insert ( 3545 bytea_string_agg_transfn - bytea_string_agg_finalfn 0 2281 _null_ _null_ )); +DATA(insert ( 3545 bytea_string_agg_transfn - bytea_string_agg_finalfn 0 2281 0 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 3545 bytea_string_agg_transfn bytea_string_agg_finalfn 0 2281 _null_ )); @@ -465,6 +478,9 @@ extern void AggregateCreate(const char *aggName, List *aggfinalfnName, List *aggsortopName, Oid aggTransType, +#ifdef XCP + Oid aggCollectType, +#endif #ifdef PGXC const char *agginitval, const char *agginitcollect); diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h index e253921278..2e63e8dd6b 100644 --- a/src/include/catalog/pg_namespace.h +++ b/src/include/catalog/pg_namespace.h @@ -5,6 +5,11 @@ * along with the relation's initial contents. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -75,6 +80,11 @@ DESCR("reserved schema for TOAST tables"); DATA(insert OID = 2200 ( "public" PGUID _null_ )); DESCR("standard public schema"); #define PG_PUBLIC_NAMESPACE 2200 +#ifdef XCP +DATA(insert OID = 9 ( "storm_catalog" PGUID _null_ )); +DESCR("StormDB catalog schema"); +#define STORM_CATALOG_NAMESPACE 9 +#endif /* diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index cda3efa91a..d434303e62 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4,6 +4,11 @@ * definition of the system "procedure" relation (pg_proc) * along with the relation's initial contents. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -226,10 +231,6 @@ DATA(insert OID = 1258 ( textcat PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 2 DATA(insert OID = 84 ( boolne PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "16 16" _null_ _null_ _null_ _null_ boolne _null_ _null_ _null_ )); DATA(insert OID = 89 ( version PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 25 "" _null_ _null_ _null_ _null_ pgsql_version _null_ _null_ _null_ )); DESCR("PostgreSQL version string"); -#ifdef PGXC -DATA(insert OID = 90 ( pgxc_version PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 25 "" _null_ _null_ _null_ _null_ pgxc_version _null_ _null_ _null_ )); -DESCR("Postgres-XC version string"); -#endif /* OIDS 100 - 199 */ @@ -4670,6 +4671,12 @@ DATA(insert OID = 3202 ( pgxc_node_str PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 DESCR("get the name of the node"); DATA(insert OID = 3203 ( pgxc_is_committed PGNSP PGUID 12 1 1 0 0 f f f f t t s 1 0 16 "28" _null_ _null_ _null_ _null_ pgxc_is_committed _null_ _null_ _null_ )); DESCR("is given GXID committed or aborted?"); +DATA(insert OID = 3205 ( pgxc_lock_for_backup PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pgxc_lock_for_backup _null_ _null_ _null_ )); +DESCR("lock the cluster for taking backup"); +#ifdef XCP +DATA(insert OID = 3204 ( stormdb_promote_standby PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ stormdb_promote_standby _null_ _null_ _null_ )); +DESCR("touch trigger file on a standby machine to end replication"); +#endif #endif /* diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 25c664b7c9..f87ec04655 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -5,6 +5,11 @@ * along with the relation's initial contents. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -573,6 +578,9 @@ DATA(insert OID = 2211 ( _regtype PGNSP PGUID -1 f b A f t \054 0 2206 0 arra /* uuid */ DATA(insert OID = 2950 ( uuid PGNSP PGUID 16 f b U f t \054 0 0 2951 uuid_in uuid_out uuid_recv uuid_send - - - c p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("UUID datatype"); +#ifdef XCP +#define UUIDOID 2950 +#endif DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b A f t \054 0 2950 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); /* text search */ diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 44c1ae0c8b..a917a06a87 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -3,6 +3,11 @@ * sequence.h * prototypes for sequence.c. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -83,6 +88,10 @@ extern void ResetSequence(Oid seq_relid); extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec); +#ifdef XCP +#define DEFAULT_CACHEVAL 1 +extern int SequenceRangeVal; +#endif #ifdef PGXC /* * List of actions that registered the callback. 
diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 7d25bf3a31..e262a1d8d3 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -42,11 +42,7 @@ extern void AlterRelationNamespaceInternal(Relation classRel, Oid relOid, extern void CheckTableNotInUse(Relation rel, const char *stmt); -#ifdef PGXC -extern void ExecuteTruncate(TruncateStmt *stmt, const char *sql_statement); -#else extern void ExecuteTruncate(TruncateStmt *stmt); -#endif extern void SetRelationHasSubclass(Oid relationId, bool relhassubclass); diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 8e5499e4c5..1f7ba47466 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -210,9 +210,4 @@ extern int RI_FKey_trigger_type(Oid tgfoid); extern Datum pg_trigger_depth(PG_FUNCTION_ARGS); -#ifdef PGXC -/* Postgres-XC related functions for triggers */ -extern bool pgxc_check_triggers_shippability(Oid relid, CmdType commandType); -#endif - #endif /* TRIGGER_H */ diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 7a50d2fcb3..7d16edaac5 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -4,6 +4,11 @@ * header file for postgres vacuum cleaner and statistics analyzer * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -161,6 +166,10 @@ extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age, TransactionId *freezeTableLimit); extern void vac_update_datfrozenxid(void); extern void vacuum_delay_point(void); +#ifdef XCP +extern void vacuum_rel_coordinator(Relation onerel); +TargetEntry *make_relation_tle(Oid reloid, const char *relname, const char *column); +#endif /* in commands/vacuumlazy.c */ extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, diff --git a/src/include/commands/variable.h b/src/include/commands/variable.h index ebf7757327..14540a7561 100644 --- a/src/include/commands/variable.h +++ b/src/include/commands/variable.h @@ -2,6 +2,11 @@ * variable.h * Routines for handling specialized SET variables. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -33,6 +38,10 @@ extern bool check_client_encoding(char **newval, void **extra, GucSource source) extern void assign_client_encoding(const char *newval, void *extra); extern bool check_session_authorization(char **newval, void **extra, GucSource source); extern void assign_session_authorization(const char *newval, void *extra); +#ifdef XCP +extern bool check_global_session(char **newval, void **extra, GucSource source); +extern void assign_global_session(const char *newval, void *extra); +#endif extern bool check_role(char **newval, void **extra, GucSource source); extern void assign_role(const char *newval, void *extra); extern const char *show_role(void); diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index 48d01df1b6..b6c2400ffd 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -5,6 +5,11 @@ * and related modules. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -16,6 +21,9 @@ #define EXECDESC_H #include "nodes/execnodes.h" +#ifdef XCP +#include "pgxc/squeue.h" +#endif #include "tcop/dest.h" @@ -48,6 +56,14 @@ typedef struct QueryDesc EState *estate; /* executor's query-wide state */ PlanState *planstate; /* tree of per-plan-node state */ +#ifdef XCP + SharedQueue squeue; /* the shared memory queue to sent data to other + * nodes */ + int myindex; /* -1 if locally executed subplan is producing + * data and distribute via squeue. 
Otherwise + * get local data from squeue */ +#endif + /* This is always set NULL by the core system, but plugins can change it */ struct Instrumentation *totaltime; /* total time spent in ExecutorRun */ } QueryDesc; diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 6b9b28fa31..29e8edcc55 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -4,6 +4,11 @@ * support for the POSTGRES executor module * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -61,6 +66,10 @@ #define EXEC_FLAG_SKIP_TRIGGERS 0x0010 /* skip AfterTrigger calls */ #define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */ #define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */ +#ifdef XCP +/* distributed executor may never execute the plan on this node */ +#define EXEC_FLAG_SUBPLAN 0x0080 +#endif /* @@ -219,6 +228,9 @@ extern void EvalPlanQualEnd(EPQState *epqstate); * prototypes from functions in execProcnode.c */ extern PlanState *ExecInitNode(Plan *node, EState *estate, int eflags); +#ifdef XCP +extern void ExecFinishInitProcNode(PlanState *node); +#endif extern TupleTableSlot *ExecProcNode(PlanState *node); extern Node *MultiExecProcNode(PlanState *node); extern void ExecEndNode(PlanState *node); diff --git a/src/include/executor/producerReceiver.h b/src/include/executor/producerReceiver.h new file mode 100644 index 0000000000..1efd957863 --- /dev/null +++ b/src/include/executor/producerReceiver.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * producerReceiver.h 
+ * prototypes for producerReceiver.c + * + * + * Copyright (c) 2012-2014, TransLattice, Inc. + * + * src/include/executor/producerReceiver.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PRODUCER_RECEIVER_H +#define PRODUCER_RECEIVER_H + +#include "tcop/dest.h" +#include "pgxc/locator.h" +#include "pgxc/squeue.h" + + +extern DestReceiver *CreateProducerDestReceiver(void); + +extern void SetProducerDestReceiverParams(DestReceiver *self, + AttrNumber distKey, + Locator *locator, + SharedQueue squeue); +extern DestReceiver *SetSelfConsumerDestReceiver(DestReceiver *self, + DestReceiver *consumer); +extern void SetProducerTempMemory(DestReceiver *self, MemoryContext tmpcxt); +extern bool ProducerReceiverPushBuffers(DestReceiver *self); + +#endif /* PRODUCER_RECEIVER_H */ diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h index 6ea58632fd..693037d1e4 100644 --- a/src/include/executor/tuptable.h +++ b/src/include/executor/tuptable.h @@ -4,6 +4,11 @@ * tuple table support stuff * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -118,11 +123,16 @@ typedef struct TupleTableSlot bool tts_slow; /* saved state for slot_deform_tuple */ HeapTuple tts_tuple; /* physical tuple, or NULL if virtual */ #ifdef PGXC +#ifdef XCP + RemoteDataRow tts_datarow; /* Tuple data in DataRow format */ + MemoryContext tts_drowcxt; /* Context to store deformed */ +#else /* * PGXC extension to support tuples sent from remote Datanode. 
*/ char *tts_dataRow; /* Tuple data in DataRow format */ int tts_dataLen; /* Actual length of the data row */ +#endif bool tts_shouldFreeRow; /* should pfree tts_dataRow? */ struct AttInMetadata *tts_attinmeta; /* store here info to extract values from the DataRow */ #endif @@ -161,16 +171,30 @@ extern TupleTableSlot *ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree); #ifdef PGXC +#ifdef XCP +extern TupleTableSlot *ExecStoreDataRowTuple(RemoteDataRow datarow, + TupleTableSlot *slot, + bool shouldFree); +#else extern TupleTableSlot *ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot, bool shouldFree); #endif +#endif extern TupleTableSlot *ExecClearTuple(TupleTableSlot *slot); extern TupleTableSlot *ExecStoreVirtualTuple(TupleTableSlot *slot); extern TupleTableSlot *ExecStoreAllNullTuple(TupleTableSlot *slot); extern HeapTuple ExecCopySlotTuple(TupleTableSlot *slot); extern MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot); +#ifdef PGXC +#ifdef XCP +extern RemoteDataRow ExecCopySlotDatarow(TupleTableSlot *slot, + MemoryContext tmpcxt); +#else +extern int ExecCopySlotDatarow(TupleTableSlot *slot, char **datarow); +#endif +#endif extern HeapTuple ExecFetchSlotTuple(TupleTableSlot *slot); extern MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot); extern Datum ExecFetchSlotTupleDatum(TupleTableSlot *slot); diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index 293d67119b..8affafd358 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -3,6 +3,11 @@ * gtm.h * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -92,6 +97,10 @@ void GTM_DoForAllOtherThreads(void (* process_routine)(GTM_ThreadInfo *)); GTM_ThreadInfo *GTM_ThreadCreate(GTM_ConnectionInfo *conninfo, void *(* startroutine)(void *)); GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid); +#ifdef XCP +extern void SaveControlInfo(void); +#define CONTROL_INTERVAL 1000 +#endif /* * pthread keys to get thread specific information diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index f34f4dbb35..b43f0edaa0 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -98,7 +98,7 @@ typedef GTM_SequenceKeyData *GTM_SequenceKey; #define InvalidSequenceValue 0x7fffffffffffffffLL #define SEQVAL_IS_VALID(v) ((v) != InvalidSequenceValue) -#define GTM_MAX_GLOBAL_TRANSACTIONS 4096 +#define GTM_MAX_GLOBAL_TRANSACTIONS 16384 typedef enum GTM_IsolationLevel { diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index e50701a7a7..9d7e500480 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -3,6 +3,11 @@ * gtm_client.h * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -53,12 +58,15 @@ typedef union GTM_ResultData { GTM_SequenceKeyData seqkey; GTM_Sequence seqval; - } grd_seq; /* SEQUENCE_GET_NEXT */ - +#ifdef XCP + GTM_Sequence rangemax; +#endif + } grd_seq; /* SEQUENCE_GET_CURRENT + * SEQUENCE_GET_NEXT */ struct { - int seq_count; - GTM_SeqInfo **seq; + int seq_count; + GTM_SeqInfo *seq; } grd_seq_list; /* SEQUENCE_GET_LIST */ struct @@ -160,7 +168,7 @@ int end_replication_initial_sync(GTM_Conn *); size_t get_node_list(GTM_Conn *, GTM_PGXCNodeInfo *, size_t); GlobalTransactionId get_next_gxid(GTM_Conn *); uint32 get_txn_gxid_list(GTM_Conn *, GTM_Transactions *); -size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **, size_t); +size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **); /* * Transaction Management API @@ -248,6 +256,10 @@ int node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char *node_name int bkup_node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char * node_name); int backend_disconnect(GTM_Conn *conn, bool is_postmaster, GTM_PGXCNodeType type, char *node_name); char *node_get_local_addr(GTM_Conn *conn, char *buf, size_t buflen, int *rc); +#ifdef XCP +int register_session(GTM_Conn *conn, const char *coord_name, int coord_procid, + int coord_backendid); +#endif /* * Sequence Management API @@ -268,10 +280,26 @@ int close_sequence(GTM_Conn *conn, GTM_SequenceKey key); int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key); int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey); int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey); +#ifdef XCP +int get_current(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, GTM_Sequence *result); +int get_next(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, 
int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); +int bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key, + char *coord_name, int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); +int set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled); +int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled); +#else +GTM_Sequence get_current(GTM_Conn *conn, GTM_SequenceKey key); GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key); GTM_Sequence bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key); int set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool is_called); int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool is_called); +#endif int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); int bkup_reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index 64f27bfeeb..560c4428f6 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -3,6 +3,11 @@ * gtm_msg.h * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -27,6 +32,9 @@ typedef enum GTM_MessageType MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */ MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */ MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */ +#ifdef XCP + MSG_REGISTER_SESSION, /* Register distributed session with GTM */ +#endif MSG_NODE_LIST, /* Get node list */ MSG_NODE_BEGIN_REPLICATION_INIT, MSG_NODE_END_REPLICATION_INIT, @@ -62,6 +70,7 @@ typedef enum GTM_MessageType MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */ + MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */ MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */ @@ -99,6 +108,9 @@ typedef enum GTM_ResultType SYNC_STANDBY_RESULT, NODE_REGISTER_RESULT, NODE_UNREGISTER_RESULT, +#ifdef XCP + REGISTER_SESSION_RESULT, +#endif NODE_LIST_RESULT, NODE_BEGIN_REPLICATION_INIT_RESULT, NODE_END_REPLICATION_INIT_RESULT, @@ -122,6 +134,7 @@ typedef enum GTM_ResultType SNAPSHOT_GET_MULTI_RESULT, SNAPSHOT_GXID_GET_RESULT, SEQUENCE_INIT_RESULT, + SEQUENCE_GET_CURRENT_RESULT, SEQUENCE_GET_NEXT_RESULT, SEQUENCE_GET_LAST_RESULT, SEQUENCE_SET_VAL_RESULT, diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h index af92e6d873..c849dbc884 100644 --- a/src/include/gtm/gtm_seq.h +++ b/src/include/gtm/gtm_seq.h @@ -3,6 +3,11 @@ * gtm_seq.h * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -20,12 +25,29 @@ /* Global sequence related structures */ + +#ifdef XCP +typedef struct GTM_SeqLastVal +{ + char gs_coord_name[SP_NODE_NAME]; + int32 gs_coord_procid; + GTM_Sequence gs_last_value; +} GTM_SeqLastVal; +#endif + + typedef struct GTM_SeqInfo { GTM_SequenceKey gs_key; GTM_Sequence gs_value; GTM_Sequence gs_init_value; +#ifdef XCP + int32 gs_max_lastvals; + int32 gs_lastval_count; + GTM_SeqLastVal *gs_last_values; +#else GTM_Sequence gs_last_value; +#endif GTM_Sequence gs_increment_by; GTM_Sequence gs_min_value; GTM_Sequence gs_max_value; @@ -70,12 +92,24 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, bool is_restart); int GTM_SeqClose(GTM_SequenceKey seqkey); int GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey); +#ifdef XCP +int GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax); +void GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, GTM_Sequence *result); +int GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, + int coord_procid, GTM_Sequence nextval, bool iscalled); +#else GTM_Sequence GTM_SeqGetNext(GTM_SequenceKey seqkey); +GTM_Sequence GTM_SeqGetCurrent(GTM_SequenceKey seqkey); int GTM_SeqSetVal(GTM_SequenceKey seqkey, GTM_Sequence nextval, bool iscalled); +#endif int GTM_SeqReset(GTM_SequenceKey seqkey); void ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup); +void ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message); void ProcessSequenceGetNextCommand(Port *myport, 
StringInfo message, bool is_backup); void ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup); void ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup); @@ -97,4 +131,8 @@ int GTM_SeqRestore(GTM_SequenceKey seqkey, bool cycle, bool called); +#ifdef XCP +void GTM_CleanupSeqSession(char *coord_name, int coord_procid); +#endif + #endif diff --git a/src/include/gtm/gtm_serialize.h b/src/include/gtm/gtm_serialize.h index 1c31299281..2cabeb1a5e 100644 --- a/src/include/gtm/gtm_serialize.h +++ b/src/include/gtm/gtm_serialize.h @@ -36,11 +36,15 @@ size_t gtm_deserialize_transactions(GTM_Transactions *, const char *, size_t); size_t gtm_get_pgxcnodeinfo_size(GTM_PGXCNodeInfo *); size_t gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, char *, size_t); +#ifdef XCP +size_t gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, const char *, size_t, PQExpBuffer *); +#else size_t gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, const char *, size_t); +#endif size_t gtm_get_sequence_size(GTM_SeqInfo *); size_t gtm_serialize_sequence(GTM_SeqInfo *, char *, size_t); -GTM_SeqInfo *gtm_deserialize_sequence(const char *, size_t); +size_t gtm_deserialize_sequence(GTM_SeqInfo *seq, const char *, size_t); void dump_transactions_elog(GTM_Transactions *, int); void dump_transactioninfo_elog(GTM_TransactionInfo *); diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h index 448fc49fbc..e9fa57f6bf 100644 --- a/src/include/gtm/gtm_standby.h +++ b/src/include/gtm/gtm_standby.h @@ -51,9 +51,8 @@ void gtm_standby_closeActiveConn(void); void gtm_standby_finishActiveConn(void); -/* Functions to process backup */ -void ProcessGTMBeginBackup(Port *myport, StringInfo message); -void ProcessGTMEndBackup(Port *myport, StringInfo message); + + /* * Startup mode diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h index 942e46ce0e..57a97eb1c9 100644 --- a/src/include/gtm/gtm_txn.h +++ b/src/include/gtm/gtm_txn.h @@ -169,11 
+169,7 @@ typedef struct GTM_Transactions extern GTM_Transactions GTMTransactions; -/* - * This macro should be used with READ lock held on gt_TransArrayLock as the - * number of open transactions might change when counting open transactions - * if a lock is not hold. - */ +/* NOTE: This macro should be used with READ lock held on gt_TransArrayLock! */ #define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions)) /* diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h index 4d0e99f068..b9cc089952 100644 --- a/src/include/gtm/register.h +++ b/src/include/gtm/register.h @@ -3,6 +3,11 @@ * register.h * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -17,7 +22,6 @@ #include "gtm/libpq-be.h" #include "gtm/gtm_c.h" #include "gtm/gtm_lock.h" -#include "gtm/gtm_list.h" #include "gtm/stringinfo.h" /* @@ -39,6 +43,14 @@ typedef enum GTM_PGXCNodeStatus NODE_DISCONNECTED } GTM_PGXCNodeStatus; +#ifdef XCP +typedef struct GTM_PGXCSession +{ + int gps_coord_proc_id; + int gps_coord_backend_id; +} GTM_PGXCSession; +#endif + typedef struct GTM_PGXCNodeInfo { GTM_PGXCNodeType type; /* Type of node */ @@ -48,10 +60,16 @@ typedef struct GTM_PGXCNodeInfo char *ipaddress; /* IP address of the nodes */ char *datafolder; /* Data folder of the node */ GTM_PGXCNodeStatus status; /* Node status */ +#ifdef XCP + int max_sessions; + int num_sessions; + GTM_PGXCSession *sessions; +#endif GTM_RWLock node_lock; /* Lock on this structure */ int socket; /* socket number used for registration */ } 
GTM_PGXCNodeInfo; + /* Maximum number of nodes that can be registered */ #define MAX_NODES 1024 @@ -78,6 +96,11 @@ void Recovery_RestoreRegisterInfo(void); void Recovery_SaveRegisterInfo(void); void Recovery_PGXCNodeDisconnect(Port *myport); void Recovery_SaveRegisterFileName(char *dir); +#ifdef XCP +int Recovery_PGXCNodeRegisterCoordProcess(char *coord_node, int coord_procid, + int coord_backendid); +void ProcessPGXCRegisterSession(Port *myport, StringInfo message); +#endif void ProcessPGXCNodeRegister(Port *myport, StringInfo message, bool is_backup); void ProcessPGXCNodeUnregister(Port *myport, StringInfo message, bool is_backup); diff --git a/src/include/libpq/hba.h b/src/include/libpq/hba.h index f3b8be6a0c..12a526e691 100644 --- a/src/include/libpq/hba.h +++ b/src/include/libpq/hba.h @@ -93,4 +93,7 @@ extern int check_usermap(const char *usermap_name, bool case_sensitive); extern bool pg_isblank(const char c); +#ifdef XCP +extern List* get_parsed_hba(void); +#endif #endif /* HBA_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 5ff0856765..f54522308c 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -10,6 +10,11 @@ * Over time, this has also become the preferred place for widely known * resource-limitation stuff, such as work_mem and check_stack_depth(). * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -300,6 +305,10 @@ extern void SetUserIdAndContext(Oid userid, bool sec_def_context); extern void InitializeSessionUserId(const char *rolename); extern void InitializeSessionUserIdStandalone(void); extern void SetSessionAuthorization(Oid userid, bool is_superuser); +#ifdef XCP +extern void SetGlobalSession(Oid coordid, int coordpid); +extern char *GetClusterUserName(void); +#endif extern Oid GetCurrentRoleId(void); extern void SetCurrentRoleId(Oid roleid, bool is_superuser); @@ -345,7 +354,6 @@ typedef enum ProcessingMode extern ProcessingMode Mode; - #define IsBootstrapProcessingMode() ((bool)(Mode == BootstrapProcessing)) #define IsInitProcessingMode() ((bool)(Mode == InitProcessing)) #define IsNormalProcessingMode() ((bool)(Mode == NormalProcessing)) diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index ef1aa2743d..93b5380051 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -4,6 +4,11 @@ * definitions for executor state nodes * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -22,6 +27,9 @@ #include "utils/reltrigger.h" #include "utils/sortsupport.h" #include "utils/tuplestore.h" +#ifdef XCP +#include "pgxc/squeue.h" +#endif /* ---------------- @@ -348,9 +356,11 @@ typedef struct EState ResultRelInfo *es_result_relations; /* array of ResultRelInfos */ int es_num_result_relations; /* length of array */ ResultRelInfo *es_result_relation_info; /* currently active array elt */ -#ifdef PGXC +#ifdef PGXC +#ifndef PGXC struct PlanState *es_result_remoterel; /* currently active remote rel */ -#endif +#endif +#endif /* Stuff used for firing triggers: */ List *es_trig_target_relations; /* trigger-only ResultRelInfos */ @@ -1061,9 +1071,9 @@ typedef struct ModifyTableState bool canSetTag; /* do we set the command tag/es_processed? */ bool mt_done; /* are we done? */ PlanState **mt_plans; /* subplans (one per target rel) */ -#ifdef PGXC +#ifdef PGXC PlanState **mt_remoterels; /* per-target remote query node */ -#endif +#endif int mt_nplans; /* number of plans in the array */ int mt_whichplan; /* which one is being executed (0..n-1) */ ResultRelInfo *resultRelInfo; /* per-subplan target relations */ @@ -1694,7 +1704,9 @@ typedef struct AggState bool table_filled; /* hash table filled yet? */ TupleHashIterator hashiter; /* for iterating through hash table */ #ifdef PGXC +#ifndef XCP bool skip_trans; /* skip the transition step for aggregates */ +#endif /* XCP */ #endif /* PGXC */ } AggState; diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 00d03b9602..2c9cf5ee15 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -4,6 +4,11 @@ * Definitions for tagged nodes. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -82,9 +87,15 @@ typedef enum NodeTag * TAGS FOR PGXC NODES * (planner.h, locator.h, nodemgr.h, groupmgr.h) */ +#ifdef XCP + T_Distribution, +#endif T_ExecNodes, T_SimpleSort, T_RemoteQuery, +#ifdef XCP + T_RemoteSubplan, +#endif T_PGXCNodeHandle, T_AlterNodeStmt, T_CreateNodeStmt, @@ -139,6 +150,9 @@ typedef enum NodeTag T_LimitState, #ifdef PGXC T_RemoteQueryState, +#ifdef XCP + T_RemoteSubplanState, +#endif #endif /* @@ -261,10 +275,9 @@ typedef enum NodeTag T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, -#ifdef PGXC - T_RemoteQueryPath, -#endif /* PGXC */ - +#ifdef XCP + T_RemoteSubPath, +#endif /* * TAGS FOR MEMORY NODES (memnodes.h) */ @@ -347,6 +360,7 @@ typedef enum NodeTag T_CheckPointStmt, #ifdef PGXC T_BarrierStmt, + T_PauseClusterStmt, #endif T_CreateSchemaStmt, T_AlterDatabaseStmt, @@ -382,6 +396,9 @@ typedef enum NodeTag T_DropUserMappingStmt, T_ExecDirectStmt, T_CleanConnStmt, +#ifdef XCP + T_RemoteStmt, +#endif T_AlterTableSpaceOptionsStmt, T_SecLabelStmt, T_CreateForeignTableStmt, @@ -518,11 +535,17 @@ extern PGDLLIMPORT Node *newNodeMacroHolder; /* * nodes/{outfuncs.c,print.c} */ +#ifdef XCP +extern void set_portable_output(bool value); +#endif extern char *nodeToString(const void *obj); /* * nodes/{readfuncs.c,read.c} */ +#ifdef XCP +extern void set_portable_input(bool value); +#endif extern void *stringToNode(char *str); /* diff --git a/src/include/nodes/params.h b/src/include/nodes/params.h index 3989006078..a7cdd0d888 100644 --- a/src/include/nodes/params.h +++ b/src/include/nodes/params.h @@ -4,6 +4,11 @@ * Support for finding the 
values associated with Param nodes. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -97,6 +102,9 @@ typedef struct ParamExecData void *execPlan; /* should be "SubPlanState *" */ Datum value; bool isnull; +#ifdef XCP + Oid ptype; +#endif } ParamExecData; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index b7b361bc76..e7a4e826c5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -10,6 +10,11 @@ * the location. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -156,13 +161,17 @@ typedef struct Query List *constraintDeps; /* a list of pg_constraint OIDs that the query * depends on to be semantically valid */ #ifdef PGXC +#ifndef XCP /* need this info for PGXC Planner, may be temporary */ char *sql_statement; /* original query */ + bool qry_finalise_aggs; /* used for queries intended for Datanodes, + * should Datanode finalise the aggregates? */ bool is_local; /* enforce query execution on local node * this is used by EXECUTE DIRECT especially. 
*/ bool is_ins_child_sel_parent;/* true if the query is such an INSERT SELECT that * inserts into a child by selecting from its parent */ #endif +#endif } Query; @@ -713,8 +722,10 @@ typedef struct RangeTblEntry */ #ifdef PGXC +#ifndef XCP char *relname; #endif +#endif /* * Fields valid for a plain relation RTE (else zero): @@ -1253,7 +1264,7 @@ typedef enum AlterTableType AT_AddNodeList, /* ADD NODE nodelist */ AT_DeleteNodeList, /* DELETE NODE nodelist */ #endif - AT_GenericOptions /* OPTIONS (...) */ + AT_GenericOptions, /* OPTIONS (...) */ } AlterTableType; typedef struct AlterTableCmd /* one subcommand of an ALTER TABLE */ @@ -2420,6 +2431,16 @@ typedef struct VacuumStmt #ifdef PGXC /* + * --------------------------- + * Pause Cluster Statement + */ +typedef struct PauseClusterStmt +{ + NodeTag type; + bool pause; /* will be false to unpause */ +} PauseClusterStmt; + +/* * ---------------------- * Barrier Statement */ @@ -2448,6 +2469,7 @@ typedef struct AlterNodeStmt { NodeTag type; char *node_name; + bool cluster; List *options; } AlterNodeStmt; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index b7aa20b0c9..644dd18a38 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -4,6 +4,11 @@ * definitions for query plan nodes * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -67,6 +72,19 @@ typedef struct PlannedStmt List *invalItems; /* other dependencies, as PlanInvalItems */ int nParamExec; /* number of PARAM_EXEC Params used */ +#ifdef XCP + int nParamRemote; /* number of params sent from the master mode */ + + struct RemoteParam *remoteparams;/* parameter descriptors */ + + const char *pname; /* the portal name */ + + /* Parameters to filter out result rows */ + char distributionType; + AttrNumber distributionKey; + List *distributionNodes; + List *distributionRestrict; +#endif } PlannedStmt; /* macro for fetching the Plan associated with a SubPlan node */ @@ -175,9 +193,11 @@ typedef struct ModifyTable List *returningLists; /* per-target-table RETURNING tlists */ List *rowMarks; /* PlanRowMarks (non-locking only) */ int epqParam; /* ID of Param for EvalPlanQual re-eval */ -#ifdef PGXC +#ifdef PGXC +#ifndef XCP List *remote_plans; /* per-target-table remote node */ -#endif +#endif +#endif } ModifyTable; /* ---------------- @@ -590,12 +610,6 @@ typedef struct Sort Oid *sortOperators; /* OIDs of operators to sort them by */ Oid *collations; /* OIDs of collations */ bool *nullsFirst; /* NULLS FIRST/LAST directions */ -#ifdef PGXC - bool srt_start_merge;/* No need to create the sorted runs. The - * underlying plan provides those runs. Merge - * them. 
- */ -#endif /* PGXC */ } Sort; /* --------------- @@ -633,18 +647,33 @@ typedef enum AggStrategy AGG_HASHED /* grouped agg, use internal hashtable */ } AggStrategy; +#ifdef XCP +typedef enum AggDistribution +{ + AGG_ONENODE, /* not distributed aggregation */ + AGG_SLAVE, /* execute only transient function */ + AGG_MASTER /* execute collection function as transient + * and final finction */ +} AggDistribution; +#endif + typedef struct Agg { Plan plan; AggStrategy aggstrategy; +#ifdef XCP + AggDistribution aggdistribution; +#endif int numCols; /* number of grouping columns */ AttrNumber *grpColIdx; /* their indexes in the target list */ Oid *grpOperators; /* equality operators to compare with */ long numGroups; /* estimated number of groups in input */ #ifdef PGXC +#ifndef XCP bool skip_trans; /* apply collection directly on the data received * from remote Datanodes */ +#endif /* XCP */ #endif /* PGXC */ } Agg; diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 211b2cfc12..3ec44c2b8b 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -7,6 +7,11 @@ * and join trees. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -244,8 +249,10 @@ typedef struct Aggref Oid aggcollid; /* OID of collation of result */ Oid inputcollid; /* OID of collation that function should use */ #ifdef PGXC +#ifndef XCP Oid aggtrantype; /* type Oid of transition results */ bool agghas_collectfn; /* is collection function available */ +#endif /* XCP */ #endif /* PGXC */ List *args; /* arguments and sort expressions */ List *aggorder; /* ORDER BY (list of SortGroupClause) */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 964b371517..f036ead7dd 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -4,6 +4,11 @@ * Definitions for planner's internal data structures. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -20,6 +25,25 @@ #include "storage/block.h" +#ifdef XCP +/* + * Distribution + * + * Distribution is an attribute of distributed plan node. It describes on which + * node execution results can be found. + */ +typedef struct Distribution +{ + NodeTag type; + + char distributionType; + Node *distributionExpr; + Bitmapset *nodes; + Bitmapset *restrictNodes; +} Distribution; +#endif + + /* * Relids * Set of relation identifiers (indexes into the rangetable). 
@@ -229,6 +253,7 @@ typedef struct PlannerInfo bool hasRecursion; /* true if planning a recursive WITH item */ #ifdef PGXC +#ifndef XCP /* This field is used only when RemoteScan nodes are involved */ int rs_alias_index; /* used to build the alias reference */ @@ -242,6 +267,7 @@ typedef struct PlannerInfo */ List *xc_rowMarks; /* list of PlanRowMarks of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE */ #endif +#endif /* These fields are used only when hasRecursion is true: */ int wt_param_id; /* PARAM_EXEC ID for the work table */ @@ -250,9 +276,20 @@ typedef struct PlannerInfo /* These fields are workspace for createplan.c */ Relids curOuterRels; /* outer rels above current node */ List *curOuterParams; /* not-yet-assigned NestLoopParams */ +#ifdef XCP + Bitmapset *curOuterRestrict; /* Datanodes where outer plan is executed */ +#endif /* optional private data for join_search_hook, e.g., GEQO */ void *join_search_private; +#ifdef XCP + /* + * This is NULL for a SELECT query (NULL distribution means "Coordinator" + * everywhere in the planner. For INSERT, UPDATE or DELETE it should match + * to the target table distribution. + */ + Distribution *distribution; /* Query result distribution */ +#endif } PlannerInfo; @@ -710,6 +747,9 @@ typedef struct Path List *pathkeys; /* sort ordering of path's output */ /* pathkeys is a List of PathKey nodes; see above */ +#ifdef XCP + Distribution *distribution; +#endif } Path; /* Macro for extracting a path's parameterization relids; beware double eval */ @@ -947,6 +987,14 @@ typedef struct UniquePath List *uniq_exprs; /* expressions to be made unique */ } UniquePath; +#ifdef XCP +typedef struct RemoteSubPath +{ + Path path; + Path *subpath; +} RemoteSubPath; +#endif + /* * All join-type paths share these fields. 
*/ @@ -1028,45 +1076,6 @@ typedef struct HashPath int num_batches; /* number of batches expected */ } HashPath; -#ifdef PGXC -/* - * A remotequery path represents the queries to be sent to the datanode/s - * - * When RemoteQuery plan is created from RemoteQueryPath, we build the query to - * be executed at the datanode. For building such a query, it's important to get - * the RHS relation and LHS relation of the JOIN clause. So, instead of storing - * the outer and inner paths, we find out the RHS and LHS paths and store those - * here. - */ - -typedef struct RemoteQueryPath -{ - Path path; - ExecNodes *rqpath_en; /* List of datanodes to execute the query on */ - /* - * If the path represents a JOIN rel, leftpath and rightpath represent the - * RemoteQuery paths for left (outer) and right (inner) side of the JOIN - * resp. jointype and join_restrictlist pertains to such JOINs. - */ - struct RemoteQueryPath *leftpath; - struct RemoteQueryPath *rightpath; - JoinType jointype; - List *join_restrictlist; /* restrict list corresponding to JOINs, - * only considered if rest of - * the JOIN information is - * available - */ - bool rqhas_unshippable_qual; /* TRUE if there is at least - * one qual which can not be - * shipped to the datanodes - */ - bool rqhas_temp_rel; /* TRUE if one of the base relations - * involved in this path is a temporary - * table. - */ -} RemoteQueryPath; -#endif /* PGXC */ - /* * Restriction clause info. * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index abc9a69afe..2f40438398 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -4,6 +4,11 @@ * prototypes for costsize.c and clausesel.c. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,6 +31,10 @@ #define DEFAULT_CPU_TUPLE_COST 0.01 #define DEFAULT_CPU_INDEX_TUPLE_COST 0.005 #define DEFAULT_CPU_OPERATOR_COST 0.0025 +#ifdef XCP +#define DEFAULT_NETWORK_BYTE_COST 0.001 +#define DEFAULT_REMOTE_QUERY_COST 100.0 +#endif #define DEFAULT_EFFECTIVE_CACHE_SIZE 16384 /* measured in pages */ @@ -48,6 +57,10 @@ extern PGDLLIMPORT double random_page_cost; extern PGDLLIMPORT double cpu_tuple_cost; extern PGDLLIMPORT double cpu_index_tuple_cost; extern PGDLLIMPORT double cpu_operator_cost; +#ifdef XCP +extern PGDLLIMPORT double network_byte_cost; +extern PGDLLIMPORT double remote_query_cost; +#endif extern PGDLLIMPORT int effective_cache_size; extern Cost disable_cost; extern bool enable_seqscan; @@ -62,11 +75,8 @@ extern bool enable_material; extern bool enable_mergejoin; extern bool enable_hashjoin; #ifdef PGXC -extern bool enable_fast_query_shipping; extern bool enable_remotejoin; extern bool enable_remotegroup; -extern bool enable_remotesort; -extern bool enable_remotelimit; #endif extern int constraint_exclusion; @@ -92,7 +102,7 @@ extern void cost_functionscan(Path *path, PlannerInfo *root, extern void cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); #ifdef PGXC -extern void cost_remotequery(RemoteQueryPath *rqpath, PlannerInfo *root, RelOptInfo *rel); +extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel); #endif extern void cost_ctescan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm); @@ -154,6 +164,11 @@ extern void final_cost_hashjoin(PlannerInfo *root, HashPath *path, extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan); extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root); extern void cost_qual_eval_node(QualCost *cost, Node *qual, 
PlannerInfo *root); +#ifdef XCP +extern void cost_remote_subplan(Path *path, + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width, int replication); +#endif extern void compute_semi_anti_join_factors(PlannerInfo *root, RelOptInfo *outerrel, RelOptInfo *innerrel, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 493256fbe8..2fd43c0cc6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -4,6 +4,11 @@ * prototypes for pathnode.c, relnode.c. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -67,8 +72,14 @@ extern ResultPath *create_result_path(List *quals); extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath); extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, SpecialJoinInfo *sjinfo); +#ifdef XCP +extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, + List *pathkeys, Relids required_outer, + Distribution *distribution); +#else extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, List *pathkeys, Relids required_outer); +#endif extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel); extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel); @@ -78,6 +89,12 @@ extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, List *pathkeys, Relids required_outer, List *fdw_private); +#ifdef PGXC +#ifndef XCP +extern Path *create_remotequery_path(PlannerInfo *root, RelOptInfo 
*rel); +#endif +#endif + extern Relids calc_nestloop_required_outer(Path *outer_path, Path *inner_path); extern Relids calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 50af07e0f9..b6fb8ee5ce 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -85,19 +85,6 @@ extern void add_paths_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, JoinType jointype, SpecialJoinInfo *sjinfo, List *restrictlist); -#ifdef PGXC -/* - * rquerypath.c - * routines to create RemoteQuery paths - */ -extern bool create_plainrel_rqpath(PlannerInfo *root, RelOptInfo *rel, - RangeTblEntry *rte); -extern void create_joinrel_rqpath(PlannerInfo *root, RelOptInfo *joinrel, - RelOptInfo *outerrel, RelOptInfo *innerrel, - List *restrictlist, JoinType jointype, - SpecialJoinInfo *sjinfo); -#endif /* PGXC */ - /* * joinrels.c * routines to determine which relations to join diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index ecee00e4e4..39a5650eb6 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -4,6 +4,11 @@ * prototypes for various files in optimizer/plan * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -16,6 +21,9 @@ #include "nodes/plannodes.h" #include "nodes/relation.h" +#ifdef XCP +#include "pgxc/planner.h" +#endif /* GUC parameters */ #define DEFAULT_CURSOR_TUPLE_FRACTION 0.1 @@ -129,19 +137,22 @@ extern void extract_query_dependencies(Node *query, List **invalItems); #ifdef PGXC -/* - * prototypes for plan/pgxcplan.c - */ -extern Plan *create_remotedml_plan(PlannerInfo *root, Plan *topplan, - CmdType cmdtyp); +#ifdef XCP +extern RemoteSubplan *find_push_down_plan(Plan *plan, bool force); +extern RemoteSubplan *make_remotesubplan(PlannerInfo *root, + Plan *lefttree, + Distribution *resultDistribution, + Distribution *execDistribution, + List *pathkeys); +#else +extern Var *search_tlist_for_var(Var *var, List *jtlist); +extern Plan *create_remoteinsert_plan(PlannerInfo *root, Plan *topplan); +extern Plan *create_remoteupdate_plan(PlannerInfo *root, Plan *topplan); +extern Plan *create_remotedelete_plan(PlannerInfo *root, Plan *topplan); extern Plan *create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan); -extern Plan *create_remotequery_plan(PlannerInfo *root, RemoteQueryPath *best_path); -extern Plan *create_remotesort_plan(PlannerInfo *root, Plan *local_plan); -extern Plan *create_remotelimit_plan(PlannerInfo *root, Plan *local_plan); -extern List *pgxc_order_qual_clauses(PlannerInfo *root, List *clauses); -extern List *pgxc_build_relation_tlist(RelOptInfo *rel); -extern void pgxc_copy_path_costsize(Plan *dest, Path *src); -extern Plan *pgxc_create_gating_plan(PlannerInfo *root, Plan *plan, List *quals); -#endif +/* Expose fix_scan_expr to create_remotequery_plan() */ +extern Node *pgxc_fix_scan_expr(PlannerInfo *root, Node *node, int rtoffset); +#endif /* XCP */ +#endif /* PGXC */ #endif /* PLANMAIN_H */ diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 
1f0993b519..79f71cdd36 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -38,6 +38,10 @@ extern Plan *subquery_planner(PlannerGlobal *glob, Query *parse, extern bool is_dummy_plan(Plan *plan); extern Expr *expression_planner(Expr *expr); +#ifdef PGXC +extern void GetHashExecNodes(RelationLocInfo *rel_loc_info, + ExecNodes **exec_nodes, const Expr *expr); +#endif extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h index 5fbf520992..dd72351533 100644 --- a/src/include/parser/analyze.h +++ b/src/include/parser/analyze.h @@ -4,6 +4,11 @@ * parse analysis for optimizable statements * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -40,4 +45,7 @@ extern void CheckSelectLocking(Query *qry); extern void applyLockingClause(Query *qry, Index rtindex, bool forUpdate, bool noWait, bool pushedDown); +#ifdef XCP +extern void ParseAnalyze_callback(ParseState *pstate, Query *query); +#endif #endif /* ANALYZE_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 33966987e4..977a5ba999 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -292,6 +292,9 @@ PG_KEYWORD("partial", PARTIAL, UNRESERVED_KEYWORD) PG_KEYWORD("partition", PARTITION, UNRESERVED_KEYWORD) PG_KEYWORD("passing", PASSING, UNRESERVED_KEYWORD) PG_KEYWORD("password", PASSWORD, UNRESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("pause", PAUSE, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("placing", PLACING, RESERVED_KEYWORD) PG_KEYWORD("plans", PLANS, UNRESERVED_KEYWORD) 
PG_KEYWORD("position", POSITION, COL_NAME_KEYWORD) @@ -401,6 +404,9 @@ PG_KEYWORD("unique", UNIQUE, RESERVED_KEYWORD) PG_KEYWORD("unknown", UNKNOWN, UNRESERVED_KEYWORD) PG_KEYWORD("unlisten", UNLISTEN, UNRESERVED_KEYWORD) PG_KEYWORD("unlogged", UNLOGGED, UNRESERVED_KEYWORD) +#ifdef PGXC +PG_KEYWORD("unpause", UNPAUSE, UNRESERVED_KEYWORD) +#endif PG_KEYWORD("until", UNTIL, UNRESERVED_KEYWORD) PG_KEYWORD("update", UPDATE, UNRESERVED_KEYWORD) PG_KEYWORD("user", USER, RESERVED_KEYWORD) diff --git a/src/include/parser/parse_agg.h b/src/include/parser/parse_agg.h index b32ee6c272..19fbb01535 100644 --- a/src/include/parser/parse_agg.h +++ b/src/include/parser/parse_agg.h @@ -3,6 +3,11 @@ * parse_agg.h * handle aggregates and window functions in parser * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -27,11 +32,20 @@ extern void parseCheckWindowFuncs(ParseState *pstate, Query *qry); extern void build_aggregate_fnexprs(Oid *agg_input_types, int agg_num_inputs, Oid agg_state_type, +#ifdef XCP + Oid agg_collect_type, +#endif Oid agg_result_type, Oid agg_input_collation, Oid transfn_oid, +#ifdef XCP + Oid collectfn_oid, +#endif Oid finalfn_oid, Expr **transfnexpr, +#ifdef XCP + Expr **collectfnexpr, +#endif Expr **finalfnexpr); #endif /* PARSE_AGG_H */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index fdd0db682c..355335ae83 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -4,6 +4,11 @@ * parse analysis for utility commands * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -17,8 +22,13 @@ #include "parser/parse_node.h" - +#ifdef XCP +extern bool loose_constraints; +extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute); +#else extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString); +#endif extern List *transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString); extern IndexStmt *transformIndexStmt(IndexStmt *stmt, const char *queryString); diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 5677b6c97f..6a3bdd634d 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -542,13 +542,13 @@ #define MEMSET_LOOP_LIMIT 1024 /* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "[email protected]" +#define PACKAGE_BUGREPORT "[email protected]" /* Define to the full name of this package. */ -#define PACKAGE_NAME "Postgres-XC" +#define PACKAGE_NAME "Postgres-XL" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Postgres-XC 1.1devel" +#define PACKAGE_STRING "Postgres-XL 9.2.0" /* Define to the version of this package. */ #define PACKAGE_VERSION "9.2beta2" @@ -560,7 +560,7 @@ #define PG_VERSION_NUM 90200 /* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "postgres-xc" +#define PACKAGE_TARNAME "postgres-xl" /* Postgres-XC version as a string */ #define PGXC_VERSION "1.1devel" diff --git a/src/include/pgstat.h b/src/include/pgstat.h index dd978d79c3..fdff029017 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -3,6 +3,11 @@ * * Definitions for the PostgreSQL statistics collector daemon. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Copyright (c) 2001-2012, PostgreSQL Global Development Group * * src/include/pgstat.h @@ -839,6 +844,11 @@ extern void pgstat_count_heap_insert(Relation rel, int n); extern void pgstat_count_heap_update(Relation rel, bool hot); extern void pgstat_count_heap_delete(Relation rel); extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); +#ifdef XCP +extern void pgstat_count_remote_insert(Relation rel, int n); +extern void pgstat_count_remote_update(Relation rel, int n); +extern void pgstat_count_remote_delete(Relation rel, int n); +#endif extern void pgstat_init_function_usage(FunctionCallInfoData *fcinfo, PgStat_FunctionCallUsage *fcu); diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 169be003b4..f52bb3181b 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -5,6 +5,11 @@ * Functions to execute commands on multiple Datanodes * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -18,11 +23,15 @@ #include "locator.h" #include "nodes/nodes.h" #include "pgxcnode.h" +#include "planner.h" +#ifdef XCP +#include "squeue.h" +#include "remotecopy.h" +#endif #include "access/tupdesc.h" #include "executor/tuptable.h" #include "nodes/execnodes.h" #include "nodes/pg_list.h" -#include "optimizer/pgxcplan.h" #include "tcop/dest.h" #include "tcop/pquery.h" #include "utils/snapshot.h" @@ -38,6 +47,10 @@ extern bool EnforceTwoPhaseCommit; #define RESPONSE_DATAROW 3 #define RESPONSE_COPY 4 #define RESPONSE_BARRIER_OK 5 +#ifdef XCP +#define RESPONSE_ERROR 6 +#define RESPONSE_READY 10 +#endif typedef enum { @@ -46,6 +59,10 @@ typedef enum REQUEST_TYPE_QUERY, /* Row description response */ REQUEST_TYPE_COPY_IN, /* Copy In response */ REQUEST_TYPE_COPY_OUT /* Copy Out response */ +#ifdef XCP + , + REQUEST_TYPE_ERROR /* Error, ignore responses */ +#endif } RequestType; /* @@ -66,6 +83,8 @@ typedef struct CombineTag char data[COMPLETION_TAG_BUFSIZE]; /* execution result combination data */ } CombineTag; + +#ifndef XCP /* * Represents a DataRow message received from a remote node. 
* Contains originating node number and message body in DataRow format without @@ -78,8 +97,18 @@ typedef struct RemoteDataRowData int msgnode; /* node number of the data row message */ } RemoteDataRowData; typedef RemoteDataRowData *RemoteDataRow; +#endif +#ifdef XCP +/* + * Common part for all plan state nodes needed to access remote datanodes + * ResponseCombiner must be the first field of the plan state node so we can + * typecast + */ +typedef struct ResponseCombiner +#else typedef struct RemoteQueryState +#endif { ScanState ss; /* its first field is NodeTag */ int node_count; /* total count of participating nodes */ @@ -93,54 +122,157 @@ typedef struct RemoteQueryState int description_count; /* count of received RowDescription messages */ int copy_in_count; /* count of received CopyIn messages */ int copy_out_count; /* count of received CopyOut messages */ + FILE *copy_file; /* used if copy_dest == COPY_FILE */ + uint64 processed; /* count of data rows handled */ char errorCode[5]; /* error code to send back to client */ char *errorMessage; /* error message to send back to client */ char *errorDetail; /* error detail to send back to client */ - bool query_Done; /* query has been sent down to Datanodes */ +#ifdef XCP + Oid returning_node; /* returning replicated node */ + RemoteDataRow currentRow; /* next data ro to be wrapped into a tuple */ +#else RemoteDataRowData currentRow; /* next data ro to be wrapped into a tuple */ +#endif /* TODO use a tuplestore as a rowbuffer */ List *rowBuffer; /* buffer where rows are stored when connection * should be cleaned for reuse by other RemoteQuery */ +#ifdef XCP + /* + * To handle special case - if there is a simple sort and sort connection + * is buffered. If EOF is reached on a connection it should be removed from + * the array, but we need to know node number of the connection to find + * messages in the buffer. 
So we store nodenum to that array if reach EOF + * when buffering + */ + Oid *tapenodes; + /* + * If some tape (connection) is buffered, contains a reference on the cell + * right before first row buffered from this tape, needed to speed up + * access to the data + */ + ListCell **tapemarks; + bool merge_sort; /* perform mergesort of node tuples */ + bool extended_query; /* running extended query protocol */ + bool probing_primary; /* trying replicated on primary node */ +#else /* - * To handle special case - if this RemoteQuery is feeding sorted data to - * Sort plan and if the connection fetching data from the Datanode + * To handle special case - if there is a simple sort and sort connection * is buffered. If EOF is reached on a connection it should be removed from * the array, but we need to know node number of the connection to find * messages in the buffer. So we store nodenum to that array if reach EOF * when buffering */ int *tapenodes; - RemoteCopyType remoteCopyType; /* Type of remote COPY operation */ - FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */ - uint64 processed; /* count of data rows when running CopyOut */ +#endif + void *tuplesortstate; /* for merge sort */ + /* COPY support */ + RemoteCopyType remoteCopyType; + Tuplestorestate *tuplestorestate; /* cursor support */ char *cursor; /* cursor name */ char *update_cursor; /* throw this cursor current tuple can be updated */ int cursor_count; /* total count of participating nodes */ - PGXCNodeHandle **cursor_connections;/* Datanode connections being combined */ + PGXCNodeHandle **cursor_connections;/* data node connections being combined */ +#ifdef XCP +} ResponseCombiner; + +typedef struct RemoteQueryState +{ + ResponseCombiner combiner; /* see ResponseCombiner struct */ +#endif + bool query_Done; /* query has been sent down to Datanodes */ + /* + * While we are not supporting grouping use this flag to indicate we need + * to initialize collecting of aggregates from the DNs + */ + 
bool initAggregates; + /* Simple DISTINCT support */ + FmgrInfo *eqfunctions; /* functions to compare tuples */ + MemoryContext tmp_ctx; /* separate context is needed to compare tuples */ /* Support for parameters */ char *paramval_data; /* parameter data, format is like in BIND */ int paramval_len; /* length of parameter values data */ - Oid *rqs_param_types; /* Types of the remote params */ - int rqs_num_params; int eflags; /* capability flags to pass to tuplestore */ bool eof_underlying; /* reached end of underlying plan? */ - Tuplestorestate *tuplestorestate; +#ifndef XCP CommandId rqs_cmd_id; /* Cmd id to use in some special cases */ - int rqs_tapenum; /* Connection from which to fetch next row, - * in case of Sorting */ - TupleTableSlot *rqs_tapedata; /* Data received from this connection to be - * buffered between getlen and readtup calls - * for sort */ - bool rqs_for_sort; /* The row fetches will be handled by Sort */ - bool non_fqs_dml; /* true if this is a non fast query shipped DML - * For detailed discussion on why this variable - * is required see comments in ExecProcNodeDMLInXC */ +#endif } RemoteQueryState; + +#ifdef XCP +typedef struct RemoteParam +{ + ParamKind paramkind; /* kind of parameter */ + int paramid; /* numeric ID for parameter */ + Oid paramtype; /* pg_type OID of parameter's datatype */ +} RemoteParam; + + +/* + * Execution state of a RemoteSubplan node + */ +typedef struct RemoteSubplanState +{ + ResponseCombiner combiner; /* see ResponseCombiner struct */ + char *subplanstr; /* subplan encoded as a string */ + bool bound; /* subplan is sent down to the nodes */ + bool local_exec; /* execute subplan on this datanode */ + Locator *locator; /* determine destination of tuples of + * locally executed plan */ + int *dest_nodes; /* allocate once */ + List *execNodes; /* where to execute subplan */ + /* should query be executed on all (true) or any (false) node specified + * in the execNodes list */ + bool execOnAll; + int nParamRemote; /* 
number of params sent from the master node */ + RemoteParam *remoteparams; /* parameter descriptors */ +} RemoteSubplanState; + + +/* + * Data needed to set up a PreparedStatement on the remote node and other data + * for the remote executor + */ +typedef struct RemoteStmt +{ + NodeTag type; + + CmdType commandType; /* select|insert|update|delete */ + + bool hasReturning; /* is it insert|update|delete RETURNING? */ + + struct Plan *planTree; /* tree of Plan nodes */ + + List *rtable; /* list of RangeTblEntry nodes */ + + /* rtable indexes of target relations for INSERT/UPDATE/DELETE */ + List *resultRelations; /* integer list of RT indexes, or NIL */ + + List *subplans; /* Plan trees for SubPlan expressions */ + + int nParamExec; /* number of PARAM_EXEC Params used */ + + int nParamRemote; /* number of params sent from the master node */ + + RemoteParam *remoteparams; /* parameter descriptors */ + + List *rowMarks; + + char distributionType; + + AttrNumber distributionKey; + + List *distributionNodes; + + List *distributionRestrict; +} RemoteStmt; +#endif + typedef void (*xact_callback) (bool isCommit, void *args); +#ifndef XCP /* Multinode Executor */ extern void PGXCNodeBegin(void); extern void PGXCNodeSetBeginQuery(char *query_string); @@ -149,51 +281,104 @@ extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); extern bool PGXCNodeRollbackPrepared(char *gid); extern void PGXCNodeCommitPrepared(char *gid); +#endif + /* Copy command just involves Datanodes */ +#ifdef XCP +extern void DataNodeCopyBegin(RemoteCopyData *rcstate); +extern int DataNodeCopyIn(char *data_row, int len, int conn_count, + PGXCNodeHandle** copy_connections); +extern uint64 DataNodeCopyOut(PGXCNodeHandle** copy_connections, + int conn_count, FILE* copy_file); +extern uint64 DataNodeCopyStore(PGXCNodeHandle** copy_connections, + int conn_count, Tuplestorestate* store); +extern void DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections); +extern int 
DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count, + PGXCNodeHandle** connections); +#else extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot); extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections); extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc, FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType); extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type); -extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections); +#endif +extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); +#ifndef XCP extern int ExecCountSlotsRemoteQuery(RemoteQuery *node); +#endif extern RemoteQueryState *ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags); extern TupleTableSlot* ExecRemoteQuery(RemoteQueryState *step); extern void ExecEndRemoteQuery(RemoteQueryState *step); +#ifdef XCP +extern void RemoteSubplanMakeUnique(Node *plan, int unique); +extern RemoteSubplanState *ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags); +extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node); +extern TupleTableSlot* ExecRemoteSubplan(RemoteSubplanState *node); +extern void ExecEndRemoteSubplan(RemoteSubplanState *node); +extern void ExecReScanRemoteSubplan(RemoteSubplanState *node); +#endif extern void ExecRemoteUtility(RemoteQuery *node); -extern int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner); extern bool is_data_node_ready(PGXCNodeHandle * conn); -extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); + +#ifdef XCP +extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner); +#else +extern int 
handle_response(PGXCNodeHandle *conn, RemoteQueryState *combiner); +#endif +extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, + size_t len); + +#ifdef XCP +#define CHECK_OWNERSHIP(conn, node) \ + do { \ + if ((conn)->state == DN_CONNECTION_STATE_QUERY && \ + (conn)->combiner && \ + (conn)->combiner != (ResponseCombiner *) (node)) \ + BufferConnection(conn); \ + (conn)->combiner = (ResponseCombiner *) (node); \ + } while(0) + +extern TupleTableSlot *FetchTuple(ResponseCombiner *combiner); +extern void InitResponseCombiner(ResponseCombiner *combiner, int node_count, + CombineType combine_type); +extern void CloseCombiner(ResponseCombiner *combiner); +#else extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot); +#endif extern void BufferConnection(PGXCNodeHandle *conn); extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt); -extern void SetDataRowForExtParams(ParamListInfo params, RemoteQueryState *rq_state); +extern int ParamListToDataRow(ParamListInfo params, char** result); extern void ExecCloseRemoteStatement(const char *stmt_name, List *nodelist); -extern void PreCommit_Remote(char *prepareGID, bool preparedLocalNode); extern char *PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit); +#ifdef XCP +extern void PostPrepare_Remote(char *prepareGID, bool implicit); +extern void PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode); +#else extern void PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit); +extern void PreCommit_Remote(char *prepareGID, bool preparedLocalNode); +#endif extern bool PreAbort_Remote(void); extern void AtEOXact_Remote(void); extern bool IsTwoPhaseCommitRequired(bool localWrite); extern bool FinishRemotePreparedTransaction(char *prepareGID, bool commit); +#ifndef XCP /* Flags related to temporary objects included in query */ extern void ExecSetTempObjectIncluded(void); extern bool 
ExecIsTempObjectIncluded(void); -extern TupleTableSlot *ExecProcNodeDMLInXC(RemoteQueryState *resultRemoteRel, - TupleTableSlot *slot); +extern void ExecRemoteQueryStandard(Relation resultRelationDesc, RemoteQueryState *resultRemoteRel, TupleTableSlot *slot); extern void pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg); extern void AtEOXact_DBCleanup(bool isCommit); extern void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size); -extern void do_query(RemoteQueryState *node); #endif +#endif diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 43ee425c25..145028f962 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -13,9 +13,13 @@ #ifndef LOCATOR_H #define LOCATOR_H +#ifdef XCP +#include "fmgr.h" +#endif #define LOCATOR_TYPE_REPLICATED 'R' #define LOCATOR_TYPE_HASH 'H' #define LOCATOR_TYPE_RANGE 'G' +#define LOCATOR_TYPE_SINGLE 'S' #define LOCATOR_TYPE_RROBIN 'N' #define LOCATOR_TYPE_CUSTOM 'C' #define LOCATOR_TYPE_MODULO 'M' @@ -43,6 +47,8 @@ #include "nodes/primnodes.h" #include "utils/relcache.h" +typedef int PartAttrNumber; + /* * How relation is accessed in the query */ @@ -56,16 +62,14 @@ typedef enum typedef struct { - Oid relid; /* OID of relation */ - char locatorType; /* locator type, see above */ - AttrNumber partAttrNum; /* Distribution column attribute */ - List *nodeList; /* Node indices where data is located */ - ListCell *roundRobinNode; /* Index of the next node to use */ + Oid relid; + char locatorType; + PartAttrNumber partAttrNum; /* if partitioned */ + char *partAttrName; /* if partitioned */ + List *nodeList; /* Node Indices */ + ListCell *roundRobinNode; /* index of the next one to use */ } RelationLocInfo; -#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType) -#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType) -#define 
IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType) /* * Nodes to execute on * primarynodelist is for replicated table writes, where to execute first. @@ -75,52 +79,113 @@ typedef struct typedef struct { NodeTag type; - List *primarynodelist; /* Primary node list indexes */ - List *nodeList; /* Node list indexes */ - char baselocatortype; /* Locator type, see above */ - Expr *en_expr; /* Expression to evaluate at execution time - * if planner can not determine execution - * nodes */ - Oid en_relid; /* Relation to determine execution nodes */ - RelationAccessType accesstype; /* Access type to determine execution - * nodes */ + List *primarynodelist; + List *nodeList; + char baselocatortype; + Expr *en_expr; /* expression to evaluate at execution time if planner + * can not determine execution nodes */ + Oid en_relid; /* Relation to determine execution nodes */ + RelationAccessType accesstype; /* Access type to determine execution nodes */ } ExecNodes; -#define IsExecNodesReplicated(en) IsLocatorReplicated((en)->baselocatortype) -#define IsExecNodesColumnDistributed(en) IsLocatorColumnDistributed((en)->baselocatortype) -#define IsExecNodesDistributedByValue(en) IsLocatorDistributedByValue((en)->baselocatortype) + +#ifdef XCP +typedef enum +{ + LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1, + * value of nodeList ignored and can be NULL */ + LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from + * the array is returned */ + LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from + * the array is returned */ + LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **), + * value from the array is returned */ + LOCATOR_LIST_LIST, /* node list is a list, item type is determined by + * list type (integer, oid or pointer). 
NodeCount + * is ignored */ +} LocatorListType; + +typedef Datum (*LocatorHashFunc) (PG_FUNCTION_ARGS); + +typedef struct _Locator Locator; + + +/* + * Creates a structure holding necessary info to effectively determine nodes + * where a tuple should be stored. + * Locator does not allocate memory while working, all allocations are made at + * the creation time. + * + * Parameters: + * + * locatorType - see LOCATOR_TYPE_* constants + * accessType - see RelationAccessType enum + * dataType - actual data type of values provided to determine nodes + * listType - defines how nodeList parameter is interpreted, see + * LocatorListType enum for more details + * nodeCount - number of nodes to distribute + * nodeList - detailed info about relation nodes. Either List or array or NULL + * result - returned address of the array where locator will output node + * references. Type of array items (int, Oid or pointer (void *)) + * depends on listType. + * primary - set to true if caller ever wants to determine primary node. 
+ * Primary node will be returned as the first element of the + * result array + */ +extern Locator *createLocator(char locatorType, RelationAccessType accessType, + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary); +extern void freeLocator(Locator *locator); + +extern int GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary); +extern void *getLocatorResults(Locator *self); +extern void *getLocatorNodeMap(Locator *self); +extern int getLocatorNodeCount(Locator *self); +#endif /* Extern variables related to locations */ extern Oid primary_data_node; extern Oid preferred_data_node[MAX_PREFERRED_NODES]; extern int num_preferred_data_nodes; -/* Function for RelationLocInfo building and management */ -extern void RelationBuildLocator(Relation rel); -extern RelationLocInfo *GetRelationLocInfo(Oid relid); -extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *srcInfo); -extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo); -extern char *GetRelationDistribColumn(RelationLocInfo *locInfo); +extern void InitRelationLocInfo(void); extern char GetLocatorType(Oid relid); -extern List *GetPreferredReplicationNode(List *relNodes); -extern bool IsTableDistOnPrimary(RelationLocInfo *locInfo); -extern bool IsLocatorInfoEqual(RelationLocInfo *locInfo1, - RelationLocInfo *locInfo2); -extern int GetRoundRobinNode(Oid relid); -extern bool IsTypeDistributable(Oid colType); -extern bool IsDistribColumn(Oid relid, AttrNumber attNum); -extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, - Datum valueForDistCol, - bool isValueNull, - Oid typeOfValueForDistCol, - RelationAccessType accessType); -extern ExecNodes *GetRelationNodesByQuals(Oid reloid, - Index varno, - Node *quals, - RelationAccessType relaccess); -/* Global locator data */ -extern void FreeExecNodes(ExecNodes **exec_nodes); +extern char ConvertToLocatorType(int disttype); + +extern char *GetRelationHashColumn(RelationLocInfo 
*rel_loc_info); +extern RelationLocInfo *GetRelationLocInfo(Oid relid); +extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); +extern char GetRelationLocType(Oid relid); +extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info); +extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2); +#ifndef XCP +extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, + bool isValueNull, Oid typeOfValueForDistCol, + RelationAccessType accessType); +extern ExecNodes *GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals, + RelationAccessType relaccess); +#endif +extern bool IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name); +extern bool IsHashColumnForRelId(Oid relid, char *part_col_name); +extern int GetRoundRobinNode(Oid relid); + +extern bool IsTypeHashDistributable(Oid col_type); extern List *GetAllDataNodes(void); extern List *GetAllCoordNodes(void); +#ifdef XCP +extern int GetAnyDataNode(Bitmapset *nodes); +#else +extern List *GetPreferredReplicationNode(List *relNodes); +#endif +extern void RelationBuildLocator(Relation rel); +extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo); + +extern bool IsTypeModuloDistributable(Oid col_type); +extern char *GetRelationModuloColumn(RelationLocInfo *rel_loc_info); +extern bool IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name); +extern bool IsModuloColumnForRelId(Oid relid, char *part_col_name); +extern char *GetRelationDistColumn(RelationLocInfo *rel_loc_info); +extern bool IsDistColumnForRelId(Oid relid, char *part_col_name); +extern void FreeExecNodes(ExecNodes **exec_nodes); #endif /* LOCATOR_H */ diff --git a/src/include/pgxc/pause.h b/src/include/pgxc/pause.h new file mode 100644 index 0000000000..1ed26ac555 --- /dev/null +++ b/src/include/pgxc/pause.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * pause.h + * + * Definitions 
for the Pause/Unpause Cluster handling + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifndef PAUSE_H +#define PAUSE_H + +#include "storage/s_lock.h" + +/* Shared memory area for management of cluster pause/unpause */ +typedef struct { + int cl_holder_pid; /* pid of the process issuing CLUSTER PAUSE */ + int cl_process_count; /* Number of processes undergoing txns */ + + slock_t cl_mutex; /* locks shared variables mentioned above */ +} ClusterLockInfo; + +extern ClusterLockInfo *ClustLinfo; + +extern bool cluster_lock_held; +extern bool cluster_ex_lock_held; + +extern void ClusterLockShmemInit(void); +extern Size ClusterLockShmemSize(void); +extern void AcquireClusterLock(bool exclusive); +extern void ReleaseClusterLock(bool exclusive); + +extern void RequestClusterPause(bool pause, char *completionTag); +extern void PGXCCleanClusterLock(int code, Datum arg); +#endif diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 21cd9c6beb..60c0d138b9 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -4,6 +4,11 @@ * Postgres-XC flags and connection control information * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2011 PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -15,9 +20,14 @@ #define PGXC_H #include "storage/lwlock.h" +#include "postgres.h" extern bool isPGXCCoordinator; extern bool isPGXCDataNode; +extern bool isRestoreMode; +#ifdef XCP +extern char *parentPGXCNode; +#endif typedef enum { @@ -36,8 +46,14 @@ extern char *PGXCNodeName; extern int PGXCNodeId; extern uint32 PGXCNodeIdentifier; +extern Datum xc_lockForBackupKey1; +extern Datum xc_lockForBackupKey2; + #define IS_PGXC_COORDINATOR isPGXCCoordinator #define IS_PGXC_DATANODE isPGXCDataNode +#ifdef XCP +#define PGXC_PARENT_NODE parentPGXCNode +#endif #define REMOTE_CONN_TYPE remoteConnType #define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP) @@ -45,4 +61,9 @@ extern uint32 PGXCNodeIdentifier; #define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE) #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) -#endif /* PGXC_H */ + +/* key pair to be used as object id while using advisory lock for backup */ +#define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF +#define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF + +#endif /* PGXC */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 31f973cbca..621e4a9a45 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -5,6 +5,11 @@ * Utility functions to communicate to Datanodes and Coordinators * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group ? 
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -24,7 +29,6 @@ #define NO_SOCKET -1 - /* Connection to Datanode maintained by Pool Manager */ typedef struct PGconn NODE_CONNECTION; typedef struct PGcancel NODE_CANCEL; @@ -34,6 +38,7 @@ typedef enum { DN_CONNECTION_STATE_IDLE, /* idle, ready for query */ DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */ + DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */ DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */ DN_CONNECTION_STATE_COPY_IN, DN_CONNECTION_STATE_COPY_OUT @@ -46,6 +51,7 @@ typedef enum HANDLE_DEFAULT } PGXCNode_HandleRequested; +#ifndef XCP /* * Enumeration for two purposes * 1. To indicate to the HandleCommandComplete function whether response checking is required or not @@ -64,7 +70,7 @@ typedef enum RESP_ROLLBACK_RECEIVED, /* Response is ROLLBACK */ RESP_ROLLBACK_NOT_RECEIVED /* Response is NOT ROLLBACK */ }RESP_ROLLBACK; - +#endif #define DN_CONNECTION_STATE_ERROR(dnconn) \ ((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \ @@ -83,7 +89,12 @@ struct pgxc_node_handle /* Connection state */ char transaction_status; DNConnectionState state; +#ifdef XCP + bool read_only; + struct ResponseCombiner *combiner; +#else struct RemoteQueryState *combiner; +#endif #ifdef DN_CONNECTION_DEBUG bool have_row_desc; #endif @@ -98,14 +109,17 @@ struct pgxc_node_handle size_t inStart; size_t inEnd; size_t inCursor; - /* * Have a variable to enable/disable response checking and * if enable then read the result of response checking * * For details see comments of RESP_ROLLBACK */ +#ifdef XCP + bool ck_resp_rollback; +#else RESP_ROLLBACK ck_resp_rollback; +#endif }; typedef struct pgxc_node_handle PGXCNodeHandle; @@ -122,27 +136,46 @@ typedef struct extern void InitMultinodeExecutor(bool is_force); /* Open/close connection routines (invoked from Pool Manager) */ +#ifdef XCP +extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user, + char 
*remote_type, char *parent_node); +#else extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user, char *pgoptions, char *remote_type); +#endif extern NODE_CONNECTION *PGXCNodeConnect(char *connstr); +#ifndef XCP extern int PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command); +#endif extern void PGXCNodeClose(NODE_CONNECTION * conn); extern int PGXCNodeConnected(NODE_CONNECTION * conn); extern int PGXCNodeConnClean(NODE_CONNECTION * conn); extern void PGXCNodeCleanAndRelease(int code, Datum arg); +#ifdef XCP +extern PGXCNodeHandle *get_any_handle(List *datanodelist); +#endif /* Look at information cached in node handles */ +#ifdef XCP +extern int PGXCNodeGetNodeId(Oid nodeoid, char *node_type); +extern int PGXCNodeGetNodeIdFromName(char *node_name, char *node_type); +#else extern int PGXCNodeGetNodeId(Oid nodeoid, char node_type); -extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type); extern int PGXCNodeGetNodeIdFromName(char *node_name, char node_type); +#endif +extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type); extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only); +#ifdef XCP +extern PGXCNodeAllHandles *get_current_handles(void); +#endif extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); extern void release_handles(void); +#ifndef XCP extern void cancel_query(void); extern void clear_all_data(void); - +#endif extern int get_transaction_nodes(PGXCNodeHandle ** connections, char client_conn_type, @@ -171,6 +204,11 @@ extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *que int num_params, Oid *param_types, int paramlen, char *params, bool send_describe, int fetch_size); +#ifdef XCP +extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, + const char *query, const char *planstr, + short num_params, Oid *param_types); +#endif extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid); 
extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid); extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); @@ -185,8 +223,10 @@ extern int send_some(PGXCNodeHandle * handle, int len); extern int pgxc_node_flush(PGXCNodeHandle *handle); extern void pgxc_node_flush_read(PGXCNodeHandle *handle); +#ifndef XCP extern int pgxc_all_handles_send_gxid(PGXCNodeAllHandles *pgxc_handles, GlobalTransactionId gxid, bool stop_at_error); extern int pgxc_all_handles_send_query(PGXCNodeAllHandles *pgxc_handles, const char *buffer, bool stop_at_error); +#endif extern char get_message(PGXCNodeHandle *conn, int *len, char **msg); @@ -194,4 +234,13 @@ extern void add_error_message(PGXCNodeHandle * handle, const char *message); extern Datum pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query); +#ifdef XCP +extern void PGXCNodeSetParam(bool local, const char *name, const char *value); +extern void PGXCNodeResetParams(bool only_local); +extern char *PGXCNodeGetSessionParamStr(void); +extern char *PGXCNodeGetTransactionParamStr(void); +extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); +extern void RequestInvalidateRemoteHandles(void); +#endif + #endif /* PGXCNODE_H */ diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h new file mode 100644 index 0000000000..6ee83fb0c0 --- /dev/null +++ b/src/include/pgxc/planner.h @@ -0,0 +1,236 @@ +/*------------------------------------------------------------------------- + * + * planner.h + * Externally declared locator functions + * + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * + * src/include/pgxc/planner.h + * + *------------------------------------------------------------------------- + */ +#ifndef PGXCPLANNER_H +#define PGXCPLANNER_H + +#include "fmgr.h" +#include "lib/stringinfo.h" +#include "nodes/params.h" +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" +#include "nodes/primnodes.h" +#include "pgxc/locator.h" +#include "tcop/dest.h" +#include "nodes/relation.h" + + +typedef enum +{ + COMBINE_TYPE_NONE, /* it is known that no row count, do not parse */ + COMBINE_TYPE_SUM, /* sum row counts (partitioned, round robin) */ + COMBINE_TYPE_SAME /* expect all row counts to be the same (replicated write) */ +} CombineType; + +/* For sorting within RemoteQuery handling */ +/* + * It is pretty much like Sort, but without Plan. We may use Sort later. + */ +typedef struct +{ + NodeTag type; + int numCols; /* number of sort-key columns */ + AttrNumber *sortColIdx; /* their indexes in the target list */ + Oid *sortOperators; /* OIDs of operators to sort them by */ + Oid *sortCollations; + bool *nullsFirst; /* NULLS FIRST/LAST directions */ +} SimpleSort; + +/* + * Determines if query has to be launched + * on Coordinators only (SEQUENCE DDL), + * on Datanodes (normal Remote Queries), + * or on all Postgres-XC nodes (Utilities and DDL). + */ +typedef enum +{ +#ifdef XCP + EXEC_ON_CURRENT, +#endif + EXEC_ON_DATANODES, + EXEC_ON_COORDS, + EXEC_ON_ALL_NODES, + EXEC_ON_NONE +} RemoteQueryExecType; + +typedef enum +{ + EXEC_DIRECT_NONE, + EXEC_DIRECT_LOCAL, + EXEC_DIRECT_LOCAL_UTILITY, + EXEC_DIRECT_UTILITY, + EXEC_DIRECT_SELECT, + EXEC_DIRECT_INSERT, + EXEC_DIRECT_UPDATE, + EXEC_DIRECT_DELETE +} ExecDirectType; + +/* + * Contains instructions on processing a step of a query. + * In the prototype this will be simple, but it will eventually + * evolve into a GridSQL-style QueryStep. 
+ */ +typedef struct +{ + Scan scan; + ExecDirectType exec_direct_type; /* track if remote query is execute direct and what type it is */ + char *sql_statement; + ExecNodes *exec_nodes; /* List of Datanodes where to launch query */ + CombineType combine_type; + SimpleSort *sort; + bool read_only; /* do not use 2PC when committing read only steps */ + bool force_autocommit; /* some commands like VACUUM require autocommit mode */ + char *statement; /* if specified use it as a PreparedStatement name on Datanodes */ + char *cursor; /* if specified use it as a Portal name on Datanodes */ + int remote_num_params; /* number of parameters specified for Prepared remote statement */ + Oid *remote_param_types; /* parameter types, this pointer is shared + * across all the RemoteQuery nodes in the + * plan. So, don't change this once set. + */ + RemoteQueryExecType exec_type; +#ifndef XCP + bool is_temp; /* determine if this remote node is based + * on a temporary objects (no 2PC) */ +#endif + int reduce_level; /* in case of reduced JOIN, it's level */ + List *base_tlist; /* in case of isReduced, the base tlist */ + char *outer_alias; + char *inner_alias; + int outer_reduce_level; + int inner_reduce_level; + Relids outer_relids; + Relids inner_relids; + char *inner_statement; + char *outer_statement; + char *join_condition; + bool has_row_marks; /* Did SELECT had FOR UPDATE/SHARE? */ + bool has_ins_child_sel_parent; /* This node is part of an INSERT SELECT that + * inserts into child by selecting from its parent */ +} RemoteQuery; + + +#ifdef XCP +/* + * Going to be a RemoteQuery replacement. + * Submit left subplan to the nodes defined by the Distribution and combine + * results. 
+ */ +typedef struct +{ + Scan scan; + char distributionType; + AttrNumber distributionKey; + List *distributionNodes; + List *distributionRestrict; + List *nodeList; + bool execOnAll; + SimpleSort *sort; + char *cursor; + int unique; +} RemoteSubplan; +#endif + + +/* + * FQS_context + * This context structure is used by the Fast Query Shipping walker, to gather + * information during analysing query for Fast Query Shipping. + */ +typedef struct +{ + bool sc_for_expr; /* if false, the we are checking shippability + * of the Query, otherwise, we are checking + * shippability of a stand-alone expression. + */ + Bitmapset *sc_shippability; /* The conditions for (un)shippability of the + * query. + */ + Query *sc_query; /* the query being analysed for FQS */ + int sc_query_level; /* level of the query */ + int sc_max_varlevelsup; /* maximum upper level referred to by any + * variable reference in the query. If this + * value is greater than 0, the query is not + * shippable, if shipped alone. + */ + ExecNodes *sc_exec_nodes; /* nodes where the query should be executed */ + ExecNodes *sc_subquery_en; /* ExecNodes produced by merging the ExecNodes + * for individual subqueries. This gets + * ultimately merged with sc_exec_nodes. + */ +} Shippability_context; + +/* enum for reasons as to why a query/expression is not FQSable */ +typedef enum +{ + SS_UNSHIPPABLE_EXPR = 0, /* it has unshippable expression */ + SS_NEED_SINGLENODE, /* Has expressions which can be evaluated when + * there is only a single node involved. + * Athought aggregates too fit in this class, we + * have a separate status to report aggregates, + * see below. 
+ */ + SS_NEEDS_COORD, /* the query needs Coordinator */ + SS_VARLEVEL, /* one of its subqueries has a VAR + * referencing an upper level query + * relation + */ + SS_NO_NODES, /* no suitable nodes can be found to ship + * the query + */ + SS_UNSUPPORTED_EXPR, /* it has expressions currently unsupported + * by FQS, but such expressions might be + * supported by FQS in future + */ + SS_HAS_AGG_EXPR /* it has aggregate expressions */ +} ShippabilityStat; + +#ifndef XCP +/* global variable corresponding to the GUC with same name */ +extern bool enable_fast_query_shipping; +/* forbid SQL if unsafe, useful to turn off for development */ +extern bool StrictStatementChecking; + +/* forbid SELECT even multi-node ORDER BY */ +extern bool StrictSelectChecking; + +extern PlannedStmt *pgxc_planner(Query *query, int cursorOptions, + ParamListInfo boundParams); +extern bool IsHashDistributable(Oid col_type); + +extern ExecNodes *IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, + Relids in_relids, Relids out_relids, + Join *join, JoinPath *join_path, List *rtable); + +extern List *AddRemoteQueryNode(List *stmts, const char *queryString, + RemoteQueryExecType remoteExecType, bool is_temp); +extern bool pgxc_query_contains_temp_tables(List *queries); +extern Expr *pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum, +extern bool pgxc_query_contains_utility(List *queries); +#endif +extern bool pgxc_shippability_walker(Node *node, Shippability_context *sc_context); +extern bool pgxc_test_shippability_reason(Shippability_context *context, + ShippabilityStat reason); + +#ifdef XCP +extern PlannedStmt *pgxc_direct_planner(Query *query, int cursorOptions, + ParamListInfo boundParams); +extern List *AddRemoteQueryNode(List *stmts, const char *queryString, + RemoteQueryExecType remoteExecType); +#endif + +#endif /* PGXCPLANNER_H */ diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 7181968166..1fc04f2007 100644 --- 
a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -5,6 +5,11 @@ * Definitions for the Datanode connection pool. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * @@ -24,6 +29,7 @@ #define MAX_IDLE_TIME 60 +#ifndef XCP /* * List of flags related to pooler connection clean up when disconnecting * a session or relaeasing handles. @@ -57,11 +63,16 @@ typedef enum POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */ POOL_CMD_GLOBAL_SET /* Global SET flag */ } PoolCommandType; +#endif /* Connection pool entry */ typedef struct { +#ifdef XCP + time_t released; +#else struct timeval released; +#endif NODE_CONNECTION *conn; NODE_CANCEL *xc_cancelConn; } PGXCNodePoolSlot; @@ -81,11 +92,16 @@ typedef struct databasepool { char *database; char *user_name; +#ifndef XCP char *pgoptions; /* Connection options */ +#endif HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each * Coordinator or DataNode */ MemoryContext mcxt; struct databasepool *next; /* Reference to next to organize linked list */ +#ifdef XCP + time_t oldest_idle; +#endif } DatabasePool; /* @@ -107,19 +123,28 @@ typedef struct Oid *coord_conn_oids; /* one for each Coordinator */ PGXCNodePoolSlot **dn_connections; /* one for each Datanode */ PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */ +#ifndef XCP char *session_params; char *local_params; bool is_temp; /* Temporary objects used for this pool session? 
*/ +#endif } PoolAgent; +#ifndef XCP /* Handle to the pool manager (Session's side) */ typedef struct { /* communication channel */ PoolPort port; } PoolHandle; +#endif +#ifdef XCP +extern int PoolConnKeepAlive; +extern int PoolMaintenanceTimeout; +#else extern int MinPoolSize; +#endif extern int MaxPoolSize; extern int PoolerPort; @@ -135,6 +160,7 @@ extern int PoolManagerInit(void); /* Destroy internal structures */ extern int PoolManagerDestroy(void); +#ifndef XCP /* * Get handle to pool manager. This function should be called just before * forking off new session. It creates PoolHandle, PoolAgent and a pipe between @@ -150,12 +176,14 @@ extern PoolHandle *GetPoolManagerHandle(void); * free memory occupied by PoolHandler */ extern void PoolManagerCloseHandle(PoolHandle *handle); +#endif /* * Gracefully close connection to the PoolManager */ extern void PoolManagerDisconnect(void); +#ifndef XCP extern char *session_options(void); /* @@ -166,6 +194,7 @@ extern char *session_options(void); extern void PoolManagerConnect(PoolHandle *handle, const char *database, const char *user_name, char *pgoptions); +#endif /* * Reconnect to pool manager @@ -173,6 +202,8 @@ extern void PoolManagerConnect(PoolHandle *handle, */ extern void PoolManagerReconnect(void); + +#ifndef XCP /* * Save a SET command in Pooler. * This command is run on existent agent connections @@ -180,6 +211,7 @@ extern void PoolManagerReconnect(void); * are requested. 
*/ extern int PoolManagerSetCommand(PoolCommandType command_type, const char *set_command); +#endif /* Get pooled connections */ extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist); @@ -197,7 +229,11 @@ extern void PoolManagerReloadConnectionInfo(void); extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids); /* Return connections back to the pool, for both Coordinator and Datanode connections */ +#ifdef XCP +extern void PoolManagerReleaseConnections(bool destroy); +#else extern void PoolManagerReleaseConnections(void); +#endif /* Cancel a running query on Datanodes as well as on other Coordinators */ extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list); @@ -205,10 +241,12 @@ extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int /* Lock/unlock pool manager */ extern void PoolManagerLock(bool is_lock); +#ifndef XCP /* Check if pool has a handle */ extern bool IsPoolHandle(void); /* Send commands to alter the behavior of current transaction */ extern int PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_list); +#endif #endif diff --git a/src/include/pgxc/postgresql_fdw.h b/src/include/pgxc/postgresql_fdw.h new file mode 100644 index 0000000000..57ab2b7d1d --- /dev/null +++ b/src/include/pgxc/postgresql_fdw.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * postgresql_fdw.h + * + * foreign-data wrapper for PostgreSQL + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2012, Postgres-XC Development Group + * + * src/include/pgxc/postgresql_fdw.h + * + *------------------------------------------------------------------------- + */ + +#ifndef POSTGRES_FDW_H +#define POSTGRES_FDW_H + +#include "postgres.h" +#include "pgxc/execRemote.h" + +bool is_immutable_func(Oid funcid); +bool pgxc_is_expr_shippable(Expr *node, bool 
*has_aggs); +#endif diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h index 93368c0ada..6adb386306 100644 --- a/src/include/pgxc/remotecopy.h +++ b/src/include/pgxc/remotecopy.h @@ -16,6 +16,9 @@ #define REMOTECOPY_H #include "nodes/parsenodes.h" +#ifdef XCP +#include "pgxc/locator.h" +#endif /* * This contains the set of data necessary for remote COPY control. @@ -32,15 +35,21 @@ typedef struct RemoteCopyData { * as copy source or destination */ StringInfoData query_buf; - +#ifdef XCP + Locator *locator; /* the locator object */ + Oid dist_type; /* data type of the distribution column */ +#else /* Execution nodes for COPY */ ExecNodes *exec_nodes; +#endif /* Locator information */ RelationLocInfo *rel_loc; /* the locator key */ +#ifndef XCP int idx_dist_by_col; /* index of the distributed by column */ PGXCNodeHandle **connections; /* Involved Datanode connections */ +#endif } RemoteCopyData; /* diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h new file mode 100644 index 0000000000..4cac658fb4 --- /dev/null +++ b/src/include/pgxc/squeue.h @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * barrier.h + * + * Definitions for the shared queue handling + * + * + * Copyright (c) 2012-2014, TransLattice, Inc. 
+ * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#ifndef SQUEUE_H +#define SQUEUE_H + +#include "postgres.h" +#include "executor/tuptable.h" +#include "nodes/pg_list.h" +#include "utils/tuplestore.h" + +extern PGDLLIMPORT int NSQueues; +extern PGDLLIMPORT int SQueueSize; + +/* Fixed size of shared queue, maybe need to be GUC configurable */ +#define SQUEUE_SIZE ((long) SQueueSize * 1024L) +/* Number of shared queues, maybe need to be GUC configurable */ +#define NUM_SQUEUES ((long) NSQueues) + +#define SQUEUE_KEYSIZE (64) + +#define SQ_CONS_SELF -1 +#define SQ_CONS_NONE -2 + +typedef struct SQueueHeader *SharedQueue; + +extern Size SharedQueueShmemSize(void); +extern void SharedQueuesInit(void); +extern void SharedQueueAcquire(const char *sqname, int ncons); +extern SharedQueue SharedQueueBind(const char *sqname, List *consNodes, + List *distNodes, int *myindex, int *consMap); +extern void SharedQueueUnBind(SharedQueue squeue); +extern void SharedQueueRelease(const char *sqname); +extern void SharedQueuesCleanup(int code, Datum arg); + +extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, + Tuplestorestate **tuplestore); + +extern void SharedQueueWrite(SharedQueue squeue, int consumerIdx, + TupleTableSlot *slot, Tuplestorestate **tuplestore, + MemoryContext tmpcxt); +extern bool SharedQueueRead(SharedQueue squeue, int consumerIdx, + TupleTableSlot *slot, bool canwait); +extern void SharedQueueReset(SharedQueue squeue, int consumerIdx); +extern int SharedQueueResetNotConnected(SharedQueue squeue); +extern bool SharedQueueCanPause(SharedQueue squeue); + +#endif diff --git a/src/include/storage/backendid.h b/src/include/storage/backendid.h index 8879e2129e..6b951b1566 100644 --- a/src/include/storage/backendid.h +++ b/src/include/storage/backendid.h @@ -4,6 +4,11 @@ * POSTGRES backend id communication definitions * * + * This Source Code Form is subject to the terms of the Mozilla 
Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -24,4 +29,19 @@ typedef int BackendId; /* unique currently active backend identifier */ extern PGDLLIMPORT BackendId MyBackendId; /* backend id of this backend */ +#ifdef XCP +/* + * Two next variables make up distributed session id. Actual distributed + * session id is a string, which includes coordinator node name, but + * it is better to use Oid to store and compare with distributed session ids + * of other backends under the same postmaster. + */ +extern PGDLLIMPORT Oid MyCoordId; + +extern PGDLLIMPORT int MyCoordPid; + +/* BackendId of the first backend of the distributed session on the node */ +extern PGDLLIMPORT BackendId MyFirstBackendId; +#endif + #endif /* BACKENDID_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 7c0fb01cb4..6634554232 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -4,6 +4,11 @@ * Lightweight lock manager * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -75,6 +80,9 @@ typedef enum LWLockId BarrierLock, NodeTableLock, #endif +#ifdef XCP + SQueuesLock, +#endif RelationMappingLock, AsyncCtlLock, AsyncQueueLock, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 768e4f89df..8d861a6cfd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -4,6 +4,11 @@ * per-process shared memory data structures * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -91,6 +96,12 @@ struct PGPROC BackendId backendId; /* This backend's backend ID (if assigned) */ Oid databaseId; /* OID of database this backend is using */ Oid roleId; /* OID of role using this backend */ +#ifdef XCP + Oid coordId; /* Oid of originating coordinator */ + int coordPid; /* Pid of the originating session */ + BackendId firstBackendId; /* Backend ID of the first backend of + * the distributed session */ +#endif /* * While in hot standby mode, shows that a conflict signal has been sent diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 73b3dabc9b..3bb98b0455 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -4,6 +4,11 @@ * POSTGRES process array definitions. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -81,5 +86,8 @@ extern bool CountOtherDBBackends(Oid databaseId, extern void XidCacheRemoveRunningXids(TransactionId xid, int nxids, const TransactionId *xids, TransactionId latestXid); - +#ifdef XCP +extern void GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid); +extern int GetFirstBackendId(int *numBackends, int *backends); +#endif /* XCP */ #endif /* PROCARRAY_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index a44659b064..c0127d4f25 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -4,6 +4,11 @@ * Routines for interprocess signalling * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index f8fc2b2d6e..18c8b98016 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -4,6 +4,11 @@ * storage manager switch public interface declarations. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -68,8 +73,14 @@ typedef struct SMgrRelationData typedef SMgrRelationData *SMgrRelation; +#ifdef XCP +#define SmgrIsTemp(smgr) \ + (!OidIsValid(MyCoordId) && \ + ((smgr)->smgr_rnode.backend != InvalidBackendId)) +#else #define SmgrIsTemp(smgr) \ ((smgr)->smgr_rnode.backend != InvalidBackendId) +#endif extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h index 1a8ff4a23b..cdeb8b4810 100644 --- a/src/include/tcop/dest.h +++ b/src/include/tcop/dest.h @@ -57,6 +57,11 @@ * calls in portal and cursor manipulations. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -94,6 +99,10 @@ typedef enum DestIntoRel, /* results sent to relation (SELECT INTO) */ DestCopyOut, /* results sent to COPY TO code */ DestSQLFunction /* results sent to SQL-language func mgr */ +#ifdef XCP + , + DestProducer /* results sent to a SharedQueue */ +#endif } CommandDest; /* ---------------- diff --git a/src/include/tcop/pquery.h b/src/include/tcop/pquery.h index 22aad2e96c..d91c2a76a0 100644 --- a/src/include/tcop/pquery.h +++ b/src/include/tcop/pquery.h @@ -4,6 +4,11 @@ * prototypes for pquery.c. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -42,4 +47,9 @@ extern long PortalRunFetch(Portal portal, long count, DestReceiver *dest); +#ifdef XCP +extern int AdvanceProducingPortal(Portal portal, bool can_wait); +extern void cleanupClosedProducers(void); +#endif + #endif /* PQUERY_H */ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 502406ce62..71554f8342 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -56,4 +56,8 @@ extern bool CommandIsReadOnly(Node *parsetree); extern void CheckRelationOwnership(RangeVar *rel, bool noCatalogs); +#ifdef PGXC +extern bool pgxc_lock_for_utility_stmt(Node *parsetree); +#endif + #endif /* UTILITY_H */ diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 571697eb87..87b3a1403a 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -4,6 +4,11 @@ * Declarations for operations on built-in types. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -541,6 +546,7 @@ extern Datum void_recv(PG_FUNCTION_ARGS); extern Datum void_send(PG_FUNCTION_ARGS); #ifdef PGXC extern Datum pgxc_node_str (PG_FUNCTION_ARGS); +extern Datum pgxc_lock_for_backup (PG_FUNCTION_ARGS); #endif extern Datum trigger_in(PG_FUNCTION_ARGS); extern Datum trigger_out(PG_FUNCTION_ARGS); @@ -661,9 +667,9 @@ extern Datum pg_get_function_result(PG_FUNCTION_ARGS); extern char *deparse_expression(Node *expr, List *dpcontext, bool forceprefix, bool showimplicit); #ifdef PGXC -extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace, - bool finalise_aggs, bool sortgroup_colno); -extern void deparse_targetlist(Query *query, List *targetList, StringInfo buf); +extern List *deparse_context_for_remotequery(Alias *aliasname, Oid relid); +extern void get_query_def_from_valuesList(Query *query, StringInfo buf); +extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace); #endif extern List *deparse_context_for(const char *aliasname, Oid relid); extern List *deparse_context_for_planstate(Node *planstate, List *ancestors, @@ -809,8 +815,10 @@ extern Datum text_format_nv(PG_FUNCTION_ARGS); /* version.c */ extern Datum pgsql_version(PG_FUNCTION_ARGS); #ifdef PGXC +#ifndef XCP extern Datum pgxc_version(PG_FUNCTION_ARGS); #endif +#endif /* xid.c */ extern Datum xidin(PG_FUNCTION_ARGS); @@ -1182,6 +1190,11 @@ extern Datum pg_cursor(PG_FUNCTION_ARGS); /* backend/pgxc/pool/poolutils.c */ extern Datum pgxc_pool_check(PG_FUNCTION_ARGS); extern Datum pgxc_pool_reload(PG_FUNCTION_ARGS); + +#ifdef XCP +/* backend/pgxc/cluster/stormutils.c */ +extern Datum stormdb_promote_standby(PG_FUNCTION_ARGS); +#endif #endif /* backend/access/transam/transam.c */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 1a28efcc3f..94a8a174ac 100644 --- a/src/include/utils/guc.h +++ 
b/src/include/utils/guc.h @@ -4,6 +4,11 @@ * External declarations pertaining to backend/utils/misc/guc.c and * backend/utils/misc/guc-file.l * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Copyright (c) 2000-2012, PostgreSQL Global Development Group * Written by Peter Eisentraut <[email protected]>. * @@ -225,6 +230,10 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; +#ifdef XCP +extern char *storm_catalog_remap_string; +#endif + /* * Functions exported by guc.c */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 7709a3a088..471c2492b9 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -3,6 +3,11 @@ * lsyscache.h * Convenience routines for common queries in the system catalog cache. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -69,6 +74,11 @@ extern int32 get_atttypmod(Oid relid, AttrNumber attnum); extern void get_atttypetypmodcoll(Oid relid, AttrNumber attnum, Oid *typid, int32 *typmod, Oid *collid); extern char *get_collation_name(Oid colloid); +#ifdef XCP +extern Oid get_collation_namespace(Oid colloid); +extern int32 get_collation_encoding(Oid colloid); +extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp); +#endif extern char *get_constraint_name(Oid conoid); extern Oid get_opclass_family(Oid opclass); extern Oid get_opclass_input_type(Oid opclass); @@ -166,6 +176,15 @@ extern void free_attstatsslot(Oid atttype, Datum *values, int nvalues, float4 *numbers, int nnumbers); extern char *get_namespace_name(Oid nspid); +#ifdef XCP +extern Oid get_namespaceid(const char *nspname); +extern char *get_typ_name(Oid typid); +extern Oid get_typ_namespace(Oid typid); +extern Oid get_typname_typid(const char *typname, Oid typnamespace); +extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); +extern Oid get_opnamespace(Oid opno); +extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); +#endif extern Oid get_range_subtype(Oid rangeOid); #define type_is_array(typid) (get_element_type(typid) != InvalidOid) diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index da66ac30b0..efe2a8e3ec 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -5,6 +5,11 @@ * * See plancache.c for comments. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -161,5 +166,9 @@ extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, bool useResOwner); extern void ReleaseCachedPlan(CachedPlan *plan, bool useResOwner); +#ifdef XCP +extern void SetRemoteSubplan(CachedPlanSource *plansource, + const char *plan_string); +#endif #endif /* PLANCACHE_H */ diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 4833942654..5c883ace6b 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -36,6 +36,11 @@ * to look like NO SCROLL cursors. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -91,6 +96,10 @@ typedef enum PortalStrategy PORTAL_ONE_MOD_WITH, PORTAL_UTIL_SELECT, PORTAL_MULTI_QUERY +#ifdef XCP + , + PORTAL_DISTRIBUTED +#endif } PortalStrategy; /* @@ -156,6 +165,9 @@ typedef struct PortalData */ Tuplestorestate *holdStore; /* store for holdable cursors */ MemoryContext holdContext; /* memory containing holdStore */ +#ifdef XCP + MemoryContext tmpContext; /* temporary memory */ +#endif /* * atStart, atEnd and portalPos indicate the current cursor position. 
@@ -219,5 +231,12 @@ extern void PortalDefineQuery(Portal portal, extern Node *PortalListGetPrimaryStmt(List *stmts); extern void PortalCreateHoldStore(Portal portal); extern void PortalHashTableDeleteAll(void); +#ifdef XCP +extern void PortalCreateProducerStore(Portal portal); +extern List *getProducingPortals(void); +extern void addProducingPortal(Portal portal); +extern void removeProducingPortal(Portal portal); +extern bool portalIsProducing(Portal portal); +#endif #endif /* PORTAL_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index ff3eaec84d..3a5b6a6053 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -4,6 +4,11 @@ * POSTGRES relation descriptor (a/k/a relcache entry) definitions. * * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -26,6 +31,9 @@ #endif #include "rewrite/prs2lock.h" #include "storage/block.h" +#ifdef XCP +#include "storage/proc.h" +#endif #include "storage/relfilenode.h" #include "utils/relcache.h" #include "utils/reltrigger.h" @@ -366,15 +374,14 @@ typedef struct StdRdOptions * RelationUsesLocalBuffers * True if relation's pages are stored in local buffers. */ +#ifdef XCP +#define RelationUsesLocalBuffers(relation) \ + !OidIsValid(MyCoordId) && \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) +#else #define RelationUsesLocalBuffers(relation) \ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) - -/* - * RelationUsesTempNamespace - * True if relation's catalog entries live in a private namespace. 
- */ -#define RelationUsesTempNamespace(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) +#endif #ifdef PGXC /* @@ -385,15 +392,40 @@ typedef struct StdRdOptions #endif /* + * RelationUsesTempNamespace + * True if relation's catalog entries live in a private namespace. + */ +#define RelationUsesTempNamespace(relation) \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + +/* * RELATION_IS_LOCAL * If a rel is either temp or newly created in the current transaction, * it can be assumed to be visible only to the current backend. * * Beware of multiple eval of argument */ +#ifdef XCP +#define RELATION_IS_LOCAL(relation) \ + ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \ + ((relation)->rd_backend == MyBackendId || \ + (relation)->rd_createSubid != InvalidSubTransactionId)) +#else #define RELATION_IS_LOCAL(relation) \ ((relation)->rd_backend == MyBackendId || \ (relation)->rd_createSubid != InvalidSubTransactionId) +#endif + +#ifdef XCP +/* + * RelationGetLocatorType + * Returns the rel's locator type. 
+ */ +#define RelationGetLocatorType(relation) \ + ((relation)->rd_locator_info->locatorType) + +#endif /* * RELATION_IS_OTHER_TEMP @@ -401,9 +433,17 @@ typedef struct StdRdOptions * * Beware of multiple eval of argument */ +#ifdef XCP +#define RELATION_IS_OTHER_TEMP(relation) \ + (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ + (relation)->rd_backend != MyBackendId) && \ + ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId))) +#else #define RELATION_IS_OTHER_TEMP(relation) \ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP \ && (relation)->rd_backend != MyBackendId) +#endif /* routines in utils/cache/relcache.c */ extern void RelationIncrementReferenceCount(Relation rel); diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 249c2407e9..6c0d024cd0 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -10,6 +10,11 @@ * amounts are sorted using temporary files and a standard external sort * algorithm. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -83,7 +88,11 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, extern Tuplesortstate *tuplesort_begin_merge(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, +#ifdef XCP + ResponseCombiner *combiner, +#else RemoteQueryState *combiner, +#endif int workMem); #endif diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h index fd2ba4c75b..53c56ceea3 100644 --- a/src/include/utils/tuplestore.h +++ b/src/include/utils/tuplestore.h @@ -21,6 +21,11 @@ * Also, we have changed the API to return tuples in TupleTableSlots, * so that there is a check to prevent attempted access to system columns. * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -82,4 +87,14 @@ extern void tuplestore_clear(Tuplestorestate *state); extern void tuplestore_end(Tuplestorestate *state); +#ifdef XCP +extern Tuplestorestate *tuplestore_begin_datarow(bool interXact, int maxKBytes, + MemoryContext tmpcxt); +extern Tuplestorestate *tuplestore_begin_message(bool interXact, int maxKBytes); +extern void tuplestore_putmessage(Tuplestorestate *state, int len, char* msg); +extern char *tuplestore_getmessage(Tuplestorestate *state, int *len); +#endif + +extern void tuplestore_collect_stat(Tuplestorestate *state, char *name); + #endif /* TUPLESTORE_H */ diff --git a/src/pl/plperl/expected/plperl_lc.out b/src/pl/plperl/expected/plperl_lc.out new file mode 100644 index 0000000000..23c5fcb486 --- /dev/null +++ b/src/pl/plperl/expected/plperl_lc.out @@ -0,0 +1,23 @@ +CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$ + return ($_[0] eq "abc\x80de" ? "true" : "false"); +$$ LANGUAGE plperl; +SELECT perl_0x80_in(E'abc\x80de'); +ERROR: invalid byte sequence for encoding "UTF8": 0x80 +CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$ + return "abc\x80de"; +$$ LANGUAGE plperl; +SELECT perl_0x80_out() = E'abc\x80de'; +ERROR: invalid byte sequence for encoding "UTF8": 0x80 +CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$ + $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd"; + if (utf8::is_utf8($str)) { + $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd"; + } + return ($str ne $match ? 
$code."DIFFER" : $code."ab\x{5ddd}cd"); +$$ LANGUAGE plperl; +SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape') + encode +----------------------- + UTF8:ab\345\267\235cd +(1 row) + diff --git a/src/pl/plperl/expected/plperl_lc_1.out b/src/pl/plperl/expected/plperl_lc_1.out new file mode 100644 index 0000000000..ae873d4322 --- /dev/null +++ b/src/pl/plperl/expected/plperl_lc_1.out @@ -0,0 +1,31 @@ +CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$ + return ($_[0] eq "abc\x80de" ? "true" : "false"); +$$ LANGUAGE plperl; +SELECT perl_0x80_in(E'abc\x80de'); + perl_0x80_in +-------------- + t +(1 row) + +CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$ + return "abc\x80de"; +$$ LANGUAGE plperl; +SELECT perl_0x80_out() = E'abc\x80de'; + ?column? +---------- + t +(1 row) + +CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$ + $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd"; + if (utf8::is_utf8($str)) { + $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd"; + } + return ($str ne $match ? $code."DIFFER" : $code."ab\x{5ddd}cd"); +$$ LANGUAGE plperl; +SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape') + encode +-------------------------- + NotUTF8:ab\345\267\235cd +(1 row) + diff --git a/src/pl/plperl/sql/plperl_lc.sql b/src/pl/plperl/sql/plperl_lc.sql new file mode 100644 index 0000000000..6c2026414e --- /dev/null +++ b/src/pl/plperl/sql/plperl_lc.sql @@ -0,0 +1,16 @@ +CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$ + return ($_[0] eq "abc\x80de" ? 
"true" : "false"); +$$ LANGUAGE plperl; +SELECT perl_0x80_in(E'abc\x80de'); +CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$ + return "abc\x80de"; +$$ LANGUAGE plperl; +SELECT perl_0x80_out() = E'abc\x80de'; +CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$ + $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd"; + if (utf8::is_utf8($str)) { + $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd"; + } + return ($str ne $match ? $code."DIFFER" : $code."ab\x{5ddd}cd"); +$$ LANGUAGE plperl; +SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape') diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 470586e3db..0a6ae59552 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -3048,6 +3048,17 @@ exec_stmt_execsql(PLpgSQL_execstate *estate, q->commandType == CMD_UPDATE || q->commandType == CMD_DELETE) stmt->mod_stmt = true; + /* PGXCTODO: Support a better parameter interface for XC with DMLs */ + if (q->commandType == CMD_INSERT || + q->commandType == CMD_UPDATE || + q->commandType == CMD_DELETE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), +#ifdef XCP + errmsg("Postgres-XL does not support DML queries in PL/pgSQL"))); +#else + errmsg("Postgres-XC does not support DML queries in PL/pgSQL"))); +#endif } } } |