Support window functions a la SQL:2008.

Hitoshi Harada, with some kibitzing from Heikki and Tom.
author: Tom Lane 2008-12-28 18:54:01 +0000
committer: Tom Lane 2008-12-28 18:54:01 +0000
commit: 95b07bc7f5010233f52f9d11da74e2e5b653b0a7 (patch)
tree: 48f5858bf4eca1bfb316ef02bb959ca85f568e0a
parent: 38e9348282e9d078487147ba8a85aebec54e3a08 (diff)
92 files changed, 6716 insertions, 317 deletions
diff --git a/contrib/tsearch2/tsearch2.c b/contrib/tsearch2/tsearch2.c
index 7754f574026..bdccba787a9 100644
--- a/contrib/tsearch2/tsearch2.c
+++ b/contrib/tsearch2/tsearch2.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/contrib/tsearch2/tsearch2.c,v 1.6 2008/03/25 22:42:42 tgl Exp $
+ *	  $PostgreSQL: pgsql/contrib/tsearch2/tsearch2.c,v 1.7 2008/12/28 18:53:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -410,7 +410,15 @@ tsa_rewrite_accum(PG_FUNCTION_ARGS)
 	MemoryContext aggcontext;
 	MemoryContext oldcontext;
 
-	aggcontext = ((AggState *) fcinfo->context)->aggcontext;
+	if (fcinfo->context && IsA(fcinfo->context, AggState))
+		aggcontext = ((AggState *) fcinfo->context)->aggcontext;
+	else if (fcinfo->context && IsA(fcinfo->context, WindowAggState))
+		aggcontext = ((WindowAggState *) fcinfo->context)->wincontext;
+	else
+	{
+		elog(ERROR, "tsa_rewrite_accum called in non-aggregate context");
+		aggcontext = NULL;		/* keep compiler quiet */
+	}
 
 	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
 	{
diff --git a/doc/src/sgml/advanced.sgml b/doc/src/sgml/advanced.sgml
index 2ecb2da5c56..ce8ef535dba 100644
--- a/doc/src/sgml/advanced.sgml
+++ b/doc/src/sgml/advanced.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/advanced.sgml,v 1.54 2007/02/01 00:28:16 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/advanced.sgml,v 1.55 2008/12/28 18:53:53 tgl Exp $ -->
 
  <chapter id="tutorial-advanced">
   <title>Advanced Features</title>
@@ -240,7 +240,7 @@ COMMIT;
    <para>
     <productname>PostgreSQL</> actually treats every SQL statement as being
     executed within a transaction.  If you do not issue a <command>BEGIN</>
-    command, 
+    command,
     then each individual statement has an implicit <command>BEGIN</> and
     (if successful) <command>COMMIT</> wrapped around it.  A group of
     statements surrounded by <command>BEGIN</> and <command>COMMIT</>
@@ -265,7 +265,7 @@ COMMIT;
     with <command>ROLLBACK TO</>.  All the transaction's database changes
     between defining the savepoint and rolling back to it are discarded, but
     changes earlier than the savepoint are kept.
-   </para> 
+   </para>
 
    <para>
     After rolling back to a savepoint, it continues to be defined, so you can
@@ -274,7 +274,7 @@ COMMIT;
     system can free some resources.  Keep in mind that either releasing or
     rolling back to a savepoint
     will automatically release all savepoints that were defined after it.
-   </para> 
+   </para>
 
    <para>
     All this is happening within the transaction block, so none of it
@@ -282,7 +282,7 @@ COMMIT;
     transaction block, the committed actions become visible as a unit
     to other sessions, while the rolled-back actions never become visible
     at all.
-   </para> 
+   </para>
 
    <para>
     Remembering the bank database, suppose we debit $100.00 from Alice's
@@ -317,6 +317,242 @@ COMMIT;
   </sect1>
 
 
+  <sect1 id="tutorial-window">
+   <title id="tutorial-window-title">Window Functions</title>
+
+   <indexterm zone="tutorial-window">
+    <primary>window function</primary>
+   </indexterm>
+
+   <para>
+    A <firstterm>window function</> performs a calculation across a set of
+    table rows that are somehow related to the current row.  This is comparable
+    to the type of calculation that can be done with an aggregate function.
+    But unlike regular aggregate functions, use of a window function does not
+    cause rows to become grouped into a single output row &mdash; the
+    rows retain their separate identities.  Behind the scenes, the window
+    function is able to access more than just the current row of the query
+    result.
+   </para>
+
+   <para>
+    Here is an example that shows how to compare each employee's salary
+    with the average salary in his or her department:
+
+<programlisting>
+SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary;
+</programlisting>
+
+<screen>
+  depname  | empno | salary |          avg          
+-----------+-------+--------+-----------------------
+ develop   |    11 |   5200 | 5020.0000000000000000
+ develop   |     7 |   4200 | 5020.0000000000000000
+ develop   |     9 |   4500 | 5020.0000000000000000
+ develop   |     8 |   6000 | 5020.0000000000000000
+ develop   |    10 |   5200 | 5020.0000000000000000
+ personnel |     5 |   3500 | 3700.0000000000000000
+ personnel |     2 |   3900 | 3700.0000000000000000
+ sales     |     3 |   4800 | 4866.6666666666666667
+ sales     |     1 |   5000 | 4866.6666666666666667
+ sales     |     4 |   4800 | 4866.6666666666666667
+(10 rows)
+</screen>
+
+    The first three output columns come directly from the table
+    <structname>empsalary</>, and there is one output row for each row in the
+    table.  The fourth column represents an average taken across all the table
+    rows that have the same <structfield>depname</> value as the current row.
+    (This actually is the same function as the regular <function>avg</>
+    aggregate function, but the <literal>OVER</> clause causes it to be
+    treated as a window function and computed across an appropriate set of
+    rows.)
+   </para>
+
+   <para>
+    A window function call always contains an <literal>OVER</> clause
+    following the window function's name and argument(s).  This is what
+    syntactically distinguishes it from a regular function or aggregate
+    function.  The <literal>OVER</> clause determines exactly how the
+    rows of the query are split up for processing by the window function.
+    The <literal>PARTITION BY</> list within <literal>OVER</> specifies
+    dividing the rows into groups, or partitions, that share the same
+    values of the <literal>PARTITION BY</> expression(s).  For each row,
+    the window function is computed across the rows that fall into the
+    same partition as the current row.
+   </para>
+
+   <para>
+    Although <function>avg</> will produce the same result no matter
+    what order it processes the partition's rows in, this is not true of all
+    window functions.  When needed, you can control that order using
+    <literal>ORDER BY</> within <literal>OVER</>.  Here is an example:
+
+<programlisting>
+SELECT depname, empno, salary, rank() OVER (PARTITION BY depname ORDER BY salary DESC) FROM empsalary;
+</programlisting>
+
+<screen>
+  depname  | empno | salary | rank 
+-----------+-------+--------+------
+ develop   |     8 |   6000 |    1
+ develop   |    10 |   5200 |    2
+ develop   |    11 |   5200 |    2
+ develop   |     9 |   4500 |    4
+ develop   |     7 |   4200 |    5
+ personnel |     2 |   3900 |    1
+ personnel |     5 |   3500 |    2
+ sales     |     1 |   5000 |    1
+ sales     |     4 |   4800 |    2
+ sales     |     3 |   4800 |    2
+(10 rows)
+</screen>
+
+    As shown here, the <function>rank</> function produces a numerical rank
+    within the current row's partition for each distinct <literal>ORDER BY</>
+    value, in the order defined by the <literal>ORDER BY</> clause.
+    <function>rank</> needs no explicit parameter, because its behavior
+    is entirely determined by the <literal>OVER</> clause.
+   </para>
+
+   <para>
+    The rows considered by a window function are those of the <quote>virtual
+    table</> produced by the query's <literal>FROM</> clause as filtered by its
+    <literal>WHERE</>, <literal>GROUP BY</>, and <literal>HAVING</> clauses
+    if any.  For example, a row removed because it does not meet the
+    <literal>WHERE</> condition is not seen by any window function.
+    A query can contain multiple window functions that slice up the data
+    in different ways by means of different <literal>OVER</> clauses, but
+    they all act on the same collection of rows defined by this virtual table.
+   </para>
+
+   <para>
+    We already saw that <literal>ORDER BY</> can be omitted if the ordering
+    of rows is not important.  It is also possible to omit <literal>PARTITION
+    BY</>, in which case the window function is computed over all rows of the
+    virtual table; that is, there is one partition containing all the rows.
+   </para>
+
+   <para>
+    There is another important concept associated with window functions:
+    for each row, there is a set of rows within its partition called its
+    <firstterm>window frame</>.  When <literal>ORDER BY</> is omitted the
+    frame is always the same as the partition.  If <literal>ORDER BY</> is
+    supplied, the frame consists of all rows from the start of the partition
+    up to the current row, plus any following rows that are equal to the
+    current row according to the <literal>ORDER BY</> clause.
+     <footnote>
+      <para>
+       The SQL standard includes options to define the window frame in
+       other ways, but this definition is the only one currently supported
+       by <productname>PostgreSQL</productname>.
+      </para>
+     </footnote>
+    Many window functions act only on the rows of the window frame, rather
+    than of the whole partition.  Here is an example using <function>sum</>:
+   </para>
+
+<programlisting>
+SELECT salary, sum(salary) OVER () FROM empsalary;
+</programlisting>
+
+<screen>
+ salary |  sum  
+--------+-------
+   5200 | 47100
+   5000 | 47100
+   3500 | 47100
+   4800 | 47100
+   3900 | 47100
+   4200 | 47100
+   4500 | 47100
+   4800 | 47100
+   6000 | 47100
+   5200 | 47100
+(10 rows)
+</screen>
+
+   <para>
+    Above, since there is no <literal>ORDER BY</> in the <literal>OVER</>
+    clause, the window frame is the same as the partition, which for lack of
+    <literal>PARTITION BY</> is the whole table; in other words each sum is
+    taken over the whole table and so we get the same result for each output
+    row.  But if we add an <literal>ORDER BY</> clause, we get very different
+    results:
+   </para>
+
+<programlisting>
+SELECT salary, sum(salary) OVER (ORDER BY salary) FROM empsalary;
+</programlisting>
+
+<screen>
+ salary |  sum  
+--------+-------
+   3500 |  3500
+   3900 |  7400
+   4200 | 11600
+   4500 | 16100
+   4800 | 25700
+   4800 | 25700
+   5000 | 30700
+   5200 | 41100
+   5200 | 41100
+   6000 | 47100
+(10 rows)
+</screen>
+
+   <para>
+    Here the sum is taken from the first (lowest) salary up through the
+    current one, including any duplicates of the current one (notice the
+    results for the duplicated salaries).
+   </para>
+
+   <para>
+    Window functions are permitted only in the <literal>SELECT</literal> list
+    and the <literal>ORDER BY</> clause of the query. They are forbidden
+    elsewhere, such as in <literal>GROUP BY</>, <literal>HAVING</>
+    and <literal>WHERE</literal> clauses.  This is because they logically
+    execute after the processing of those clauses.  Also, window functions
+    execute after regular aggregate functions.  This means it is valid to
+    include an aggregate function call in the arguments of a window function,
+    but not vice versa.
+   </para>
+
+   <para>
+    If there is a need to filter or group rows after the window calculations
+    are performed, you can use a sub-select.  For example:
+
+<programlisting>
+SELECT depname, empno, salary, enroll_date
+FROM
+  (SELECT depname, empno, salary, enroll_date,
+          rank() OVER (PARTITION BY depname ORDER BY salary DESC, empno) AS pos
+     FROM empsalary
+  ) AS ss
+WHERE pos &lt; 3;
+</programlisting>
+
+    The above query only shows the rows from the inner query having
+    <literal>rank</> less than <literal>3</>.
+   </para>
+
+   <para>
+    When a query involves multiple window functions, it is possible to write
+    out each one with a separate <literal>OVER</> clause, but this is
+    duplicative and error-prone if the same windowing behavior is wanted
+    for several functions.  Instead, each windowing behavior can be named
+    in a <literal>WINDOW</> clause and then referenced in <literal>OVER</>.
+    For example:
+
+<programlisting>
+SELECT sum(salary) OVER w, avg(salary) OVER w
+  FROM empsalary
+  WINDOW w AS (PARTITION BY depname ORDER BY salary DESC);
+</programlisting>
+   </para>
+  </sect1>
+
+
   <sect1 id="tutorial-inheritance">
    <title>Inheritance</title>
 
@@ -391,7 +627,7 @@ CREATE TABLE capitals (
 
    <para>
     For example, the  following  query finds the  names  of  all  cities,
-    including  state capitals, that are located at an altitude 
+    including  state capitals, that are located at an altitude
     over 500 feet:
 
 <programlisting>
@@ -455,7 +691,7 @@ SELECT name, altitude
 
   <sect1 id="tutorial-conclusion">
    <title>Conclusion</title>
- 
+
    <para>
     <productname>PostgreSQL</productname> has many features not
     touched upon in this tutorial introduction, which has been
diff --git a/doc/src/sgml/errcodes.sgml b/doc/src/sgml/errcodes.sgml
index 574e7f5fbad..e792a74e286 100644
--- a/doc/src/sgml/errcodes.sgml
+++ b/doc/src/sgml/errcodes.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/errcodes.sgml,v 1.25 2008/10/04 21:56:52 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/errcodes.sgml,v 1.26 2008/12/28 18:53:53 tgl Exp $ -->
 
 <appendix id="errcodes-appendix">
  <title><productname>PostgreSQL</productname> Error Codes</title>
@@ -379,6 +379,18 @@
 </row>
 
 <row>
+<entry><literal>22014</literal></entry>
+<entry>INVALID ARGUMENT FOR NTILE FUNCTION</entry>
+<entry>invalid_argument_for_ntile_function</entry>
+</row>
+
+<row>
+<entry><literal>22016</literal></entry>
+<entry>INVALID ARGUMENT FOR NTH_VALUE FUNCTION</entry>
+<entry>invalid_argument_for_nth_value_function</entry>
+</row>
+
+<row>
 <entry><literal>2201F</literal></entry>
 <entry>INVALID ARGUMENT FOR POWER FUNCTION</entry>
 <entry>invalid_argument_for_power_function</entry>
@@ -991,6 +1003,12 @@
 </row>
 
 <row>
+<entry><literal>42P20</literal></entry>
+<entry>WINDOWING ERROR</entry>
+<entry>windowing_error</entry>
+</row>
+
+<row>
 <entry><literal>42P19</literal></entry>
 <entry>INVALID RECURSION</entry>
 <entry>invalid_recursion</entry>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index de50c0e1d56..205b71e9c9e 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.463 2008/12/19 16:25:16 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.464 2008/12/28 18:53:53 tgl Exp $ -->
 
  <chapter id="functions">
   <title>Functions and Operators</title>
@@ -10149,6 +10149,278 @@ SELECT xmlagg(x) FROM (SELECT x FROM test ORDER BY y DESC) AS tab;
 
  </sect1>
 
+ <sect1 id="functions-window">
+  <title>Window Functions</title>
+
+  <indexterm zone="functions-window">
+   <primary>window function</primary>
+   <secondary>built-in</secondary>
+  </indexterm>
+
+  <para>
+   <firstterm>Window functions</firstterm> provide the ability to perform
+   calculations across sets of rows that are related to the current query
+   row.  For information about this feature see
+   <xref linkend="tutorial-window"> and
+   <xref linkend="syntax-window-functions">.
+  </para>
+
+  <para>
+   The built-in window functions are listed in
+   <xref linkend="functions-window-table">.  Note that these functions
+   <emphasis>must</> be invoked using window function syntax; that is, an
+   <literal>OVER</> clause is required.
+  </para>
+
+  <para>
+   In addition to these functions, any built-in or user-defined aggregate
+   function can be used as a window function (see
+   <xref linkend="functions-aggregate"> for a list of the built-in aggregates).
+   Aggregate functions act as window functions only when an <literal>OVER</>
+   clause follows the call; otherwise they act as regular aggregates.
+  </para>
+
+  <table id="functions-window-table">
+   <title>General-Purpose Window Functions</title>
+
+   <tgroup cols="3">
+    <thead>
+     <row>
+      <entry>Function</entry>
+      <entry>Return Type</entry>
+      <entry>Description</entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry>
+       <indexterm>
+        <primary>row_number</primary>
+       </indexterm>
+       <function>row_number()</function>
+      </entry>
+      <entry>
+       <type>bigint</type>
+      </entry>
+      <entry>number of the current row within its partition, counting from 1</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>rank</primary>
+       </indexterm>
+       <function>rank()</function>
+      </entry>
+      <entry>
+       <type>bigint</type>
+      </entry>
+      <entry>rank of the current row with gaps; same as <function>row_number</> of its first peer</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>dense_rank</primary>
+       </indexterm>
+       <function>dense_rank()</function>
+      </entry>
+      <entry>
+       <type>bigint</type>
+      </entry>
+      <entry>rank of the current row without gaps; this function counts peer groups</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>percent_rank</primary>
+       </indexterm>
+       <function>percent_rank()</function>
+      </entry>
+      <entry>
+       <type>double precision</type>
+      </entry>
+      <entry>relative rank of the current row: (<function>rank</> - 1) / (total rows - 1)</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>cume_dist</primary>
+       </indexterm>
+       <function>cume_dist()</function>
+      </entry>
+      <entry>
+       <type>double precision</type>
+      </entry>
+      <entry>relative rank of the current row: (number of rows preceding or peer with current row) / (total rows)</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>ntile</primary>
+       </indexterm>
+       <function>ntile(<replaceable class="parameter">num_buckets</replaceable> <type>integer</>)</function>
+      </entry>
+      <entry>
+       <type>integer</type>
+      </entry>
+      <entry>integer ranging from 1 to the argument value, dividing the
+       partition as equally as possible</entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>lag</primary>
+       </indexterm>
+       <function>
+         lag(<replaceable class="parameter">value</replaceable> <type>any</>
+             [, <replaceable class="parameter">offset</replaceable> <type>integer</>
+             [, <replaceable class="parameter">default</replaceable> <type>any</> ]])
+       </function>
+      </entry>
+      <entry>
+       <type>same type as <replaceable class="parameter">value</replaceable></type>
+      </entry>
+      <entry>
+       returns <replaceable class="parameter">value</replaceable> evaluated at
+       the row that is <replaceable class="parameter">offset</replaceable>
+       rows before the current row within the partition; if there is no such
+       row, instead return <replaceable class="parameter">default</replaceable>.
+       Both <replaceable class="parameter">offset</replaceable> and
+       <replaceable class="parameter">default</replaceable> are evaluated
+       with respect to the current row.  If omitted,
+       <replaceable class="parameter">offset</replaceable> defaults to 1 and
+       <replaceable class="parameter">default</replaceable> to null
+      </entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>lead</primary>
+       </indexterm>
+       <function>
+         lead(<replaceable class="parameter">value</replaceable> <type>any</>
+              [, <replaceable class="parameter">offset</replaceable> <type>integer</>
+              [, <replaceable class="parameter">default</replaceable> <type>any</> ]])
+       </function>
+      </entry>
+      <entry>
+       <type>same type as <replaceable class="parameter">value</replaceable></type>
+      </entry>
+      <entry>
+       returns <replaceable class="parameter">value</replaceable> evaluated at
+       the row that is <replaceable class="parameter">offset</replaceable>
+       rows after the current row within the partition; if there is no such
+       row, instead return <replaceable class="parameter">default</replaceable>.
+       Both <replaceable class="parameter">offset</replaceable> and
+       <replaceable class="parameter">default</replaceable> are evaluated
+       with respect to the current row.  If omitted,
+       <replaceable class="parameter">offset</replaceable> defaults to 1 and
+       <replaceable class="parameter">default</replaceable> to null
+      </entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>first_value</primary>
+       </indexterm>
+       <function>first_value(<replaceable class="parameter">value</replaceable> <type>any</>)</function>
+      </entry>
+      <entry>
+       <type>same type as <replaceable class="parameter">value</replaceable></type>
+      </entry>
+      <entry>
+       returns <replaceable class="parameter">value</replaceable> evaluated
+       at the row that is the first row of the window frame
+      </entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>last_value</primary>
+       </indexterm>
+       <function>last_value(<replaceable class="parameter">value</replaceable> <type>any</>)</function>
+      </entry>
+      <entry>
+       <type>same type as <replaceable class="parameter">value</replaceable></type>
+      </entry>
+      <entry>
+       returns <replaceable class="parameter">value</replaceable> evaluated
+       at the row that is the last row of the window frame
+      </entry>
+     </row>
+
+     <row>
+      <entry>
+       <indexterm>
+        <primary>nth_value</primary>
+       </indexterm>
+       <function>
+         nth_value(<replaceable class="parameter">value</replaceable> <type>any</>, <replaceable class="parameter">nth</replaceable> <type>integer</>)
+       </function>
+      </entry>
+      <entry>
+       <type>same type as <replaceable class="parameter">value</replaceable></type>
+      </entry>
+      <entry>
+       returns <replaceable class="parameter">value</replaceable> evaluated
+       at the row that is the <replaceable class="parameter">nth</replaceable>
+       row of the window frame (counting from 1); null if no such row
+      </entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   All of the functions listed in
+   <xref linkend="functions-window-table"> depend on the sort ordering
+   specified by the <literal>ORDER BY</> clause of the associated window
+   definition.  Rows that are not distinct in the <literal>ORDER BY</>
+   ordering are said to be <firstterm>peers</>; the four ranking functions
+   are defined so that they give the same answer for any two peer rows.
+  </para>
+
+  <para>
+   Note that <function>first_value</>, <function>last_value</>, and
+   <function>nth_value</> consider only the rows within the <quote>window
+   frame</>, that is, the rows from the start of the partition through the
+   last peer of the current row.  This is particularly likely to give
+   unintuitive results for <function>last_value</>.
+  </para>
+
+  <para>
+   When an aggregate function is used as a window function, it aggregates
+   over the rows within the current row's window frame.  To obtain
+   aggregation over the whole partition, be sure to omit <literal>ORDER BY</>
+   from the window definition.  An aggregate used with <literal>ORDER BY</>
+   produces a <quote>running sum</> type of behavior, which may or may not
+   be what's wanted.
+  </para>
+
+  <note>
+   <para>
+    The SQL standard defines a <literal>RESPECT NULLS</> or
+    <literal>IGNORE NULLS</> option for <function>lead</>, <function>lag</>,
+    <function>first_value</>, <function>last_value</>, and
+    <function>nth_value</>.  This is not implemented in
+    <productname>PostgreSQL</productname>: the behavior is always the
+    same as the standard's default, namely <literal>RESPECT NULLS</>.
+    Likewise, the standard's <literal>FROM FIRST</> or <literal>FROM LAST</>
+    option for <function>nth_value</> is not implemented: only the
+    default <literal>FROM FIRST</> behavior is supported.
+   </para>
+  </note>
+
+ </sect1>
 
  <sect1 id="functions-subquery">
   <title>Subquery Expressions</title>
diff --git a/doc/src/sgml/queries.sgml b/doc/src/sgml/queries.sgml
index 283dd0a73dd..f1db64b273a 100644
--- a/doc/src/sgml/queries.sgml
+++ b/doc/src/sgml/queries.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/queries.sgml,v 1.50 2008/10/14 00:41:34 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/queries.sgml,v 1.51 2008/12/28 18:53:54 tgl Exp $ -->
 
 <chapter id="queries">
  <title>Queries</title>
@@ -949,6 +949,57 @@ SELECT product_id, p.name, (sum(s.units) * (p.price - p.cost)) AS profit
     5000.  Note that the aggregate expressions do not necessarily need
     to be the same in all parts of the query.
    </para>
+
+   <para>
+    If a query contains aggregate function calls, but no <literal>GROUP BY</>
+    clause, grouping still occurs: the result is a single group row (or
+    perhaps no rows at all, if the single row is then eliminated by
+    <literal>HAVING</>).
+    The same is true if it contains a <literal>HAVING</> clause, even
+    without any aggregate function calls or <literal>GROUP BY</> clause.
+   </para>
+  </sect2>
+
+  <sect2 id="queries-window">
+   <title>Window Function Processing</>
+
+   <indexterm zone="queries-window">
+    <primary>window function</primary>
+    <secondary>order of execution</>
+   </indexterm>
+
+   <para>
+    If the query contains any window functions (see
+    <xref linkend="tutorial-window"> and
+    <xref linkend="syntax-window-functions">), these functions are evaluated
+    after any grouping, aggregation, and <literal>HAVING</> filtering is
+    performed.  That is, if the query uses any aggregates, <literal>GROUP
+    BY</>, or <literal>HAVING</>, then the rows seen by the window functions
+    are the group rows instead of the original table rows from
+    <literal>FROM</>/<literal>WHERE</>.
+   </para>
+
+   <para>
+    When multiple window functions are used, all the window functions having
+    syntactically equivalent <literal>PARTITION BY</> and <literal>ORDER BY</>
+    clauses in their window definitions are guaranteed to be evaluated in a
+    single pass over the data. Therefore they will see the same sort ordering,
+    even if the <literal>ORDER BY</> does not uniquely determine an ordering.
+    However, no guarantees are made about the evaluation of functions having
+    different <literal>PARTITION BY</> or <literal>ORDER BY</> specifications.
+    (In such cases a sort step is typically required between the passes of
+    window function evaluations, and the sort is not guaranteed to preserve
+    ordering of rows that its <literal>ORDER BY</> sees as equivalent.)
+   </para>
+
+   <para>
+    Currently, use of window functions always forces sorting, and so the
+    query output will be ordered according to one or another of the window
+    functions' <literal>PARTITION BY</>/<literal>ORDER BY</> clauses.
+    It is not recommended to rely on this, however.  Use an explicit
+    top-level <literal>ORDER BY</> clause if you want to be sure the
+    results are sorted in a particular way.
+   </para>
   </sect2>
  </sect1>
 
diff --git a/doc/src/sgml/query.sgml b/doc/src/sgml/query.sgml
index 442f9ad0068..ffc641b03ad 100644
--- a/doc/src/sgml/query.sgml
+++ b/doc/src/sgml/query.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/query.sgml,v 1.50 2007/02/01 00:28:17 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/query.sgml,v 1.51 2008/12/28 18:53:54 tgl Exp $ -->
 
  <chapter id="tutorial-sql">
   <title>The <acronym>SQL</acronym> Language</title>
@@ -621,7 +621,7 @@ SELECT W1.city, W1.temp_lo AS low, W1.temp_hi AS high,
  San Francisco |  43 |   57 | San Francisco |  46 |   50
  Hayward       |  37 |   54 | San Francisco |  46 |   50
 (2 rows)
-</programlisting>     
+</programlisting>
 
     Here we have relabeled the weather table as <literal>W1</> and
     <literal>W2</> to be able to distinguish the left and right side
@@ -651,9 +651,9 @@ SELECT *
     <indexterm><primary>min</primary></indexterm>
     <indexterm><primary>sum</primary></indexterm>
 
-    Like  most  other relational database products, 
+    Like  most  other relational database products,
     <productname>PostgreSQL</productname> supports
-    aggregate functions.
+    <firstterm>aggregate functions</>.
     An aggregate function computes a single result from multiple input rows.
     For example, there are aggregates to compute the
     <function>count</function>, <function>sum</function>,
@@ -815,7 +815,7 @@ SELECT city, max(temp_lo)
 
    <para>
     You can update existing rows using the
-    <command>UPDATE</command> command. 
+    <command>UPDATE</command> command.
     Suppose you discover the temperature readings are
     all off by 2 degrees after November 28.  You can correct the
     data as follows:
diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml
index 814a6708f00..c9a386f24f3 100644
--- a/doc/src/sgml/ref/select.sgml
+++ b/doc/src/sgml/ref/select.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/select.sgml,v 1.112 2008/12/01 09:38:08 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/select.sgml,v 1.113 2008/12/28 18:53:54 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -39,6 +39,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="parameter">expression</replac
     [ WHERE <replaceable class="parameter">condition</replaceable> ]
     [ GROUP BY <replaceable class="parameter">expression</replaceable> [, ...] ]
     [ HAVING <replaceable class="parameter">condition</replaceable> [, ...] ]
+    [ WINDOW <replaceable class="parameter">window_name</replaceable> AS ( <replaceable class="parameter">window_definition</replaceable> ) [, ...] ]
     [ { UNION | INTERSECT | EXCEPT } [ ALL ] <replaceable class="parameter">select</replaceable> ]
     [ ORDER BY <replaceable class="parameter">expression</replaceable> [ ASC | DESC | USING <replaceable class="parameter">operator</replaceable> ] [ NULLS { FIRST | LAST } ] [, ...] ]
     [ LIMIT { <replaceable class="parameter">count</replaceable> | ALL } ]
@@ -566,6 +567,67 @@ HAVING <replaceable class="parameter">condition</replaceable>
    </para>
   </refsect2>
 
+  <refsect2 id="SQL-WINDOW">
+   <title id="sql-window-title"><literal>WINDOW</literal> Clause</title>
+
+   <para>
+    The optional <literal>WINDOW</literal> clause has the general form
+<synopsis>
+WINDOW <replaceable class="parameter">window_name</replaceable> AS ( <replaceable class="parameter">window_definition</replaceable> ) [, ...]
+</synopsis>
+    where <replaceable class="parameter">window_name</replaceable> is
+    a name that can be referenced from subsequent window definitions or
+    <literal>OVER</> clauses, and
+    <replaceable class="parameter">window_definition</replaceable> is
+<synopsis>
+[ <replaceable class="parameter">existing_window_name</replaceable> ]
+[ PARTITION BY <replaceable class="parameter">expression</replaceable> [, ...] ]
+[ ORDER BY <replaceable class="parameter">expression</replaceable> [ ASC | DESC | USING <replaceable class="parameter">operator</replaceable> ] [ NULLS { FIRST | LAST } ] [, ...] ]
+</synopsis>
+    The elements of the <literal>PARTITION BY</> list are interpreted in
+    the same fashion as elements of a
+    <xref linkend="sql-groupby" endterm="sql-groupby-title">, and
+    the elements of the <literal>ORDER BY</> list are interpreted in the
+    same fashion as elements of an
+    <xref linkend="sql-orderby" endterm="sql-orderby-title">.
+    The only difference is that these expressions can contain aggregate
+    function calls, which are not allowed in a regular <literal>GROUP BY</>
+    clause.  They are allowed here because windowing occurs after grouping
+    and aggregation.
+   </para>
+
+   <para>
+    If an <replaceable class="parameter">existing_window_name</replaceable>
+    is specified it must refer to an earlier entry in the <literal>WINDOW</>
+    list; the new window copies its partitioning clause from that entry,
+    as well as its ordering clause if any.  In this case the new window cannot
+    specify its own <literal>PARTITION BY</> clause, and it can specify
+    <literal>ORDER BY</> only if the copied window does not have one.
+   </para>
+
+   <para>
+    The purpose of a <literal>WINDOW</literal> clause is to specify the
+    behavior of <firstterm>window functions</> appearing in the query's
+    <xref linkend="sql-select-list" endterm="sql-select-list-title"> or
+    <xref linkend="sql-orderby" endterm="sql-orderby-title">.  These functions
+    can reference the <literal>WINDOW</literal> clause entries by name
+    in their <literal>OVER</> clauses.  A <literal>WINDOW</literal> clause
+    entry does not have to be referenced anywhere, however; if it is not
+    used in the query it is simply ignored.  It is possible to use window
+    functions without any <literal>WINDOW</literal> clause at all, since
+    a window function call can specify its window definition directly in
+    its <literal>OVER</> clause.  However, the <literal>WINDOW</literal>
+    clause saves typing when the same window definition is needed for more
+    than one window function.
+   </para>
+
+   <para>
+    Window functions are described in detail in
+    <xref linkend="tutorial-window"> and
+    <xref linkend="syntax-window-functions">.
+   </para>
+  </refsect2>
+
   <refsect2 id="sql-select-list">
    <title id="sql-select-list-title"><command>SELECT</command> List</title>
 
@@ -922,7 +984,7 @@ FETCH { FIRST | NEXT } [ <replaceable class="parameter">count</replaceable> ] {
     constants for the offset or fetch count, parentheses will be
     necessary in most cases.  If the fetch count is omitted, it
     defaults to 1.
-   </para>    
+   </para>
 
    <para>
     When using <literal>LIMIT</>, it is a good idea to use an
@@ -1388,6 +1450,19 @@ SELECT distributors.* WHERE distributors.name = 'Westward';
   </refsect2>
 
   <refsect2>
+   <title><literal>WINDOW</literal> Clause Restrictions</title>
+
+   <para>
+    The SQL standard provides for an optional <quote>framing clause</>,
+    introduced by the key word <literal>RANGE</> or <literal>ROWS</>,
+    in window definitions.  <productname>PostgreSQL</productname> does
+    not yet implement framing clauses, and always follows the
+    default framing behavior, which is equivalent to the framing clause
+    <literal>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</>.
+   </para>
+  </refsect2>
+
+  <refsect2>
    <title><literal>LIMIT</literal> and <literal>OFFSET</literal></title>
 
    <para>
diff --git a/doc/src/sgml/ref/select_into.sgml b/doc/src/sgml/ref/select_into.sgml
index 038ae1b333c..057bfb2a9d7 100644
--- a/doc/src/sgml/ref/select_into.sgml
+++ b/doc/src/sgml/ref/select_into.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/select_into.sgml,v 1.43 2008/11/14 10:22:47 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/select_into.sgml,v 1.44 2008/12/28 18:53:54 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -29,6 +29,7 @@ SELECT [ ALL | DISTINCT [ ON ( <replaceable class="parameter">expression</replac
     [ WHERE <replaceable class="parameter">condition</replaceable> ]
     [ GROUP BY <replaceable class="parameter">expression</replaceable> [, ...] ]
     [ HAVING <replaceable class="parameter">condition</replaceable> [, ...] ]
+    [ WINDOW <replaceable class="parameter">window_name</replaceable> AS ( <replaceable class="parameter">window_definition</replaceable> ) [, ...] ]
     [ { UNION | INTERSECT | EXCEPT } [ ALL ] <replaceable class="parameter">select</replaceable> ]
     [ ORDER BY <replaceable class="parameter">expression</replaceable> [ ASC | DESC | USING <replaceable class="parameter">operator</replaceable> ] [ NULLS { FIRST | LAST } ] [, ...] ]
     [ LIMIT { <replaceable class="parameter">count</replaceable> | ALL } ]
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index cca44794340..9d0833c2035 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.126 2008/12/09 20:52:03 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.127 2008/12/28 18:53:54 tgl Exp $ -->
 
 <chapter id="sql-syntax">
  <title>SQL Syntax</title>
@@ -1203,6 +1203,12 @@ SELECT 3 OPERATOR(pg_catalog.+) 4;
 
     <listitem>
      <para>
+      A window function call.
+     </para>
+    </listitem>
+
+    <listitem>
+     <para>
       A type cast.
      </para>
     </listitem>
@@ -1445,7 +1451,7 @@ $1.somecolumn
     enclosed in parentheses:
 
 <synopsis>
-<replaceable>function</replaceable> (<optional><replaceable>expression</replaceable> <optional>, <replaceable>expression</replaceable> ... </optional></optional> )
+<replaceable>function_name</replaceable> (<optional><replaceable>expression</replaceable> <optional>, <replaceable>expression</replaceable> ... </optional></optional> )
 </synopsis>
    </para>
 
@@ -1480,7 +1486,7 @@ sqrt(2)
 <synopsis>
 <replaceable>aggregate_name</replaceable> (<replaceable>expression</replaceable> [ , ... ] )
 <replaceable>aggregate_name</replaceable> (ALL <replaceable>expression</replaceable> [ , ... ] )
-<replaceable>aggregate_name</replaceable> (DISTINCT <replaceable>expression</replaceable> [ , ... ] )
+<replaceable>aggregate_name</replaceable> (DISTINCT <replaceable>expression</replaceable>)
 <replaceable>aggregate_name</replaceable> ( * )
 </synopsis>
 
@@ -1488,7 +1494,7 @@ sqrt(2)
     defined aggregate (possibly qualified with a schema name), and
     <replaceable>expression</replaceable> is
     any value expression that does not itself contain an aggregate
-    expression.
+    expression or a window function call.
    </para>
 
    <para>
@@ -1550,6 +1556,89 @@ sqrt(2)
    </note>
   </sect2>
 
+  <sect2 id="syntax-window-functions">
+   <title>Window Function Calls</title>
+
+   <indexterm zone="syntax-window-functions">
+    <primary>window function</primary>
+    <secondary>invocation</secondary>
+   </indexterm>
+
+   <indexterm zone="syntax-window-functions">
+    <primary>OVER clause</primary>
+   </indexterm>
+
+   <para>
+    A <firstterm>window function call</firstterm> represents the application
+    of an aggregate-like function over some portion of the rows selected
+    by a query.  Unlike regular aggregate function calls, this is not tied
+    to grouping of the selected rows into a single output row &mdash; each
+    row remains separate in the query output.  However the window function
+    is able to scan all the rows that would be part of the current row's
+    group according to the grouping specification (<literal>PARTITION BY</>
+    list) of the window function call.
+    The syntax of a window function call is one of the following:
+
+<synopsis>
+<replaceable>function_name</replaceable> (<optional><replaceable>expression</replaceable> <optional>, <replaceable>expression</replaceable> ... </optional></optional>) OVER ( <replaceable class="parameter">window_definition</replaceable> )
+<replaceable>function_name</replaceable> (<optional><replaceable>expression</replaceable> <optional>, <replaceable>expression</replaceable> ... </optional></optional>) OVER <replaceable>window_name</replaceable>
+<replaceable>function_name</replaceable> ( * ) OVER ( <replaceable class="parameter">window_definition</replaceable> )
+<replaceable>function_name</replaceable> ( * ) OVER <replaceable>window_name</replaceable>
+</synopsis>
+    where <replaceable class="parameter">window_definition</replaceable>
+    has the syntax
+<synopsis>
+[ <replaceable class="parameter">window_name</replaceable> ]
+[ PARTITION BY <replaceable class="parameter">expression</replaceable> [, ...] ]
+[ ORDER BY <replaceable class="parameter">expression</replaceable> [ ASC | DESC | USING <replaceable class="parameter">operator</replaceable> ] [ NULLS { FIRST | LAST } ] [, ...] ]
+</synopsis>
+
+    Here, <replaceable>expression</replaceable> represents any value
+    expression that does not itself contain window function calls.
+    The <literal>PARTITION BY</> and <literal>ORDER BY</> lists have
+    essentially the same syntax and semantics as <literal>GROUP BY</>
+    and <literal>ORDER BY</> clauses of the whole query.
+    <replaceable>window_name</replaceable> is a reference to a named window
+    specification defined in the query's <literal>WINDOW</literal> clause.
+    Named window specifications are usually referenced with just
+    <literal>OVER</> <replaceable>window_name</replaceable>, but it is
+    also possible to write a window name inside the parentheses and then
+    optionally override its ordering clause with <literal>ORDER BY</>.
+    This latter syntax follows the same rules as modifying an existing
+    window name within the <literal>WINDOW</literal> clause; see the
+    <xref linkend="sql-select" endterm="sql-select-title"> reference
+    page for details.
+   </para>
+
+   <para>
+    The built-in window functions are described in <xref
+    linkend="functions-window-table">.  Also, any built-in or
+    user-defined aggregate function can be used as a window function.
+    Currently, there is no provision for user-defined window functions
+    other than aggregates.
+   </para>
+
+   <para>
+    The syntaxes using <literal>*</> are used for calling parameter-less
+    aggregate functions as window functions, for example
+    <literal>count(*) OVER (PARTITION BY x ORDER BY y)</>.
+    <literal>*</> is customarily not used for non-aggregate window functions.
+    Aggregate window functions, unlike normal aggregate functions, do not
+    allow <literal>DISTINCT</> to be used within the function argument list.
+   </para>
+
+   <para>
+    Window function calls are permitted only in the <literal>SELECT</literal>
+    list and the <literal>ORDER BY</> clause of the query.
+   </para>
+
+   <para>
+    More information about window functions can be found in
+    <xref linkend="tutorial-window"> and
+    <xref linkend="queries-window">.
+   </para>
+  </sect2>
+
   <sect2 id="sql-syntax-type-casts">
    <title>Type Casts</title>
 
diff --git a/doc/src/sgml/xaggr.sgml b/doc/src/sgml/xaggr.sgml
index 3c4ce19258e..b223888f9ed 100644
--- a/doc/src/sgml/xaggr.sgml
+++ b/doc/src/sgml/xaggr.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/xaggr.sgml,v 1.36 2008/11/20 21:10:44 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/xaggr.sgml,v 1.37 2008/12/28 18:53:54 tgl Exp $ -->
 
  <sect1 id="xaggr">
   <title>User-Defined Aggregates</title>
@@ -167,10 +167,13 @@ SELECT attrelid::regclass, array_accum(atttypid::regtype)
   <para>
    A function written in C can detect that it is being called as an
    aggregate transition or final function by seeing if it was passed
-   an <structname>AggState</> node as the function call <quote>context</>,
+   an <structname>AggState</> or <structname>WindowAggState</> node
+   as the function call <quote>context</>,
    for example by:
 <programlisting>
-        if (fcinfo->context &amp;&amp; IsA(fcinfo->context, AggState))
+        if (fcinfo-&gt;context &amp;&amp;
+            (IsA(fcinfo-&gt;context, AggState) ||
+             IsA(fcinfo-&gt;context, WindowAggState)))
 </programlisting>
    One reason for checking this is that when it is true, the first input
    must be a temporary transition value and can therefore safely be modified
diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index 2cbc19f5a06..b78bebf506f 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.83 2008/12/19 16:25:17 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.84 2008/12/28 18:53:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1480,6 +1480,14 @@ find_expr_references_walker(Node *node,
 						   context->addrs);
 		/* fall through to examine arguments */
 	}
+	else if (IsA(node, WindowFunc))
+	{
+		WindowFunc *wfunc = (WindowFunc *) node;
+
+		add_object_address(OCLASS_PROC, wfunc->winfnoid, 0,
+						   context->addrs);
+		/* fall through to examine arguments */
+	}
 	else if (IsA(node, SubPlan))
 	{
 		/* Extra work needed here if we ever need this case */
@@ -1602,6 +1610,7 @@ find_expr_references_walker(Node *node,
 		/* query_tree_walker ignores ORDER BY etc, but we need those opers */
 		find_expr_references_walker((Node *) query->sortClause, context);
 		find_expr_references_walker((Node *) query->groupClause, context);
+		find_expr_references_walker((Node *) query->windowClause, context);
 		find_expr_references_walker((Node *) query->distinctClause, context);
 
 		/* Examine substructure of query */
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index a711143f86c..af200afaac8 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.347 2008/11/29 00:13:21 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.348 2008/12/28 18:53:54 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -2138,6 +2138,10 @@ cookDefault(ParseState *pstate,
 		ereport(ERROR,
 				(errcode(ERRCODE_GROUPING_ERROR),
 			 errmsg("cannot use aggregate function in default expression")));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+			 errmsg("cannot use window function in default expression")));
 
 	/*
 	 * Coerce the expression to the correct type and typmod, if given. This
@@ -2211,6 +2215,10 @@ cookConstraint(ParseState *pstate,
 		ereport(ERROR,
 				(errcode(ERRCODE_GROUPING_ERROR),
 				 errmsg("cannot use aggregate function in check constraint")));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in check constraint")));
 
 	return expr;
 }
diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c
index 345df0c6a95..8ff22c23c9e 100644
--- a/src/backend/catalog/pg_proc.c
+++ b/src/backend/catalog/pg_proc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/pg_proc.c,v 1.157 2008/12/19 18:25:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/pg_proc.c,v 1.158 2008/12/28 18:53:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -80,6 +80,8 @@ ProcedureCreate(const char *procedureName,
 				float4 prorows)
 {
 	Oid			retval;
+	/* XXX we don't currently have a way to make new window functions */
+	bool		isWindowFunc = false;
 	int			parameterCount;
 	int			allParamCount;
 	Oid		   *allParams;
@@ -292,8 +294,7 @@ ProcedureCreate(const char *procedureName,
 	values[Anum_pg_proc_prorows - 1] = Float4GetDatum(prorows);
 	values[Anum_pg_proc_provariadic - 1] = ObjectIdGetDatum(variadicType);
 	values[Anum_pg_proc_proisagg - 1] = BoolGetDatum(isAgg);
-	/* XXX we don't currently have a way to make new window functions */
-	values[Anum_pg_proc_proiswindow - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_proiswindow - 1] = BoolGetDatum(isWindowFunc);
 	values[Anum_pg_proc_prosecdef - 1] = BoolGetDatum(security_definer);
 	values[Anum_pg_proc_proisstrict - 1] = BoolGetDatum(isStrict);
 	values[Anum_pg_proc_proretset - 1] = BoolGetDatum(returnsSet);
@@ -440,18 +441,31 @@ ProcedureCreate(const char *procedureName,
 			}
 		}
 
-		/* Can't change aggregate status, either */
+		/* Can't change aggregate or window-function status, either */
 		if (oldproc->proisagg != isAgg)
 		{
 			if (oldproc->proisagg)
 				ereport(ERROR,
 						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
-						 errmsg("function \"%s\" is an aggregate",
+						 errmsg("function \"%s\" is an aggregate function",
+								procedureName)));
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+						 errmsg("function \"%s\" is not an aggregate function",
+								procedureName)));
+		}
+		if (oldproc->proiswindow != isWindowFunc)
+		{
+			if (oldproc->proiswindow)
+				ereport(ERROR,
+						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+						 errmsg("function \"%s\" is a window function",
 								procedureName)));
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_WRONG_OBJECT_TYPE),
-						 errmsg("function \"%s\" is not an aggregate",
+						 errmsg("function \"%s\" is not a window function",
 								procedureName)));
 		}
 
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index e5f1b313076..d829cb19235 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994-5, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.181 2008/11/19 01:10:23 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.182 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -590,6 +590,9 @@ explain_outNode(StringInfo str,
 					break;
 			}
 			break;
+		case T_WindowAgg:
+			pname = "WindowAgg";
+			break;
 		case T_Unique:
 			pname = "Unique";
 			break;
diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c
index 0a3de53e1e5..8963f981178 100644
--- a/src/backend/commands/functioncmds.c
+++ b/src/backend/commands/functioncmds.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.103 2008/12/18 18:20:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.104 2008/12/28 18:53:55 tgl Exp $
  *
  * DESCRIPTION
  *	  These routines take the parse tree and pick out the
@@ -321,6 +321,10 @@ examine_parameter_list(List *parameters, Oid languageOid,
 				ereport(ERROR,
 						(errcode(ERRCODE_GROUPING_ERROR),
 						 errmsg("cannot use aggregate function in parameter default value")));
+			if (pstate->p_hasWindowFuncs)
+				ereport(ERROR,
+						(errcode(ERRCODE_WINDOWING_ERROR),
+						 errmsg("cannot use window function in parameter default value")));
 
 			*parameterDefaults = lappend(*parameterDefaults, def);
 			have_defaults = true;
@@ -1538,6 +1542,10 @@ CreateCast(CreateCastStmt *stmt)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 				 errmsg("cast function must not be an aggregate function")));
+		if (procstruct->proiswindow)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+				 errmsg("cast function must not be a window function")));
 		if (procstruct->proretset)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 920b328bb30..f1f87abe227 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -10,7 +10,7 @@
  * Copyright (c) 2002-2008, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.93 2008/12/13 02:29:21 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/prepare.c,v 1.94 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -347,6 +347,10 @@ EvaluateParams(PreparedStatement *pstmt, List *params,
 			ereport(ERROR,
 					(errcode(ERRCODE_GROUPING_ERROR),
 			  errmsg("cannot use aggregate function in EXECUTE parameter")));
+		if (pstate->p_hasWindowFuncs)
+			ereport(ERROR,
+					(errcode(ERRCODE_WINDOWING_ERROR),
+			  errmsg("cannot use window function in EXECUTE parameter")));
 
 		given_type_id = exprType(expr);
 
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 9f34c735028..173b24dab82 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.274 2008/12/15 21:35:31 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.275 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -5506,6 +5506,10 @@ ATPrepAlterColumnType(List **wqueue,
 			ereport(ERROR,
 					(errcode(ERRCODE_GROUPING_ERROR),
 			errmsg("cannot use aggregate function in transform expression")));
+		if (pstate->p_hasWindowFuncs)
+			ereport(ERROR,
+					(errcode(ERRCODE_WINDOWING_ERROR),
+			errmsg("cannot use window function in transform expression")));
 	}
 	else
 	{
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index 38416fa67f2..f99ed813954 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/typecmds.c,v 1.127 2008/11/30 19:01:29 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/typecmds.c,v 1.128 2008/12/28 18:53:55 tgl Exp $
  *
  * DESCRIPTION
  *	  The "DefineFoo" routines take the parse tree and pick out the
@@ -2255,6 +2255,10 @@ domainAddConstraint(Oid domainOid, Oid domainNamespace, Oid baseTypeOid,
 		ereport(ERROR,
 				(errcode(ERRCODE_GROUPING_ERROR),
 			   errmsg("cannot use aggregate function in check constraint")));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in check constraint")));
 
 	/*
 	 * Convert to string form for storage.
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index b4a0492751c..63c86107782 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for executor
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/executor/Makefile,v 1.28 2008/10/04 21:56:52 tgl Exp $
+#    $PostgreSQL: pgsql/src/backend/executor/Makefile,v 1.29 2008/12/28 18:53:55 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -22,6 +22,6 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \
        nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
        nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \
        nodeLimit.o nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \
-       tstoreReceiver.o spi.o
+       nodeWindowAgg.o tstoreReceiver.o spi.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index ef4f6853899..d406a0cec9a 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.101 2008/10/28 17:13:51 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.102 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -20,6 +20,7 @@
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
 #include "executor/nodeBitmapOr.h"
+#include "executor/nodeCtescan.h"
 #include "executor/nodeFunctionscan.h"
 #include "executor/nodeGroup.h"
 #include "executor/nodeGroup.h"
@@ -40,7 +41,7 @@
 #include "executor/nodeTidscan.h"
 #include "executor/nodeUnique.h"
 #include "executor/nodeValuesscan.h"
-#include "executor/nodeCtescan.h"
+#include "executor/nodeWindowAgg.h"
 #include "executor/nodeWorktablescan.h"
 #include "nodes/nodeFuncs.h"
 #include "utils/syscache.h"
@@ -210,6 +211,10 @@ ExecReScan(PlanState *node, ExprContext *exprCtxt)
 			ExecReScanAgg((AggState *) node, exprCtxt);
 			break;
 
+		case T_WindowAggState:
+			ExecReScanWindowAgg((WindowAggState *) node, exprCtxt);
+			break;
+
 		case T_UniqueState:
 			ExecReScanUnique((UniqueState *) node, exprCtxt);
 			break;
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index e689ec00f8c..cd610c895c1 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -12,7 +12,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/execProcnode.c,v 1.63 2008/10/04 21:56:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/execProcnode.c,v 1.64 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -85,6 +85,7 @@
 #include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeBitmapIndexscan.h"
 #include "executor/nodeBitmapOr.h"
+#include "executor/nodeCtescan.h"
 #include "executor/nodeFunctionscan.h"
 #include "executor/nodeGroup.h"
 #include "executor/nodeHash.h"
@@ -104,7 +105,7 @@
 #include "executor/nodeTidscan.h"
 #include "executor/nodeUnique.h"
 #include "executor/nodeValuesscan.h"
-#include "executor/nodeCtescan.h"
+#include "executor/nodeWindowAgg.h"
 #include "executor/nodeWorktablescan.h"
 #include "miscadmin.h"
 
@@ -260,6 +261,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 											   estate, eflags);
 			break;
 
+		case T_WindowAgg:
+			result = (PlanState *) ExecInitWindowAgg((WindowAgg *) node,
+													 estate, eflags);
+			break;
+
 		case T_Unique:
 			result = (PlanState *) ExecInitUnique((Unique *) node,
 												  estate, eflags);
@@ -425,6 +431,10 @@ ExecProcNode(PlanState *node)
 			result = ExecAgg((AggState *) node);
 			break;
 
+		case T_WindowAggState:
+			result = ExecWindowAgg((WindowAggState *) node);
+			break;
+
 		case T_UniqueState:
 			result = ExecUnique((UniqueState *) node);
 			break;
@@ -601,6 +611,10 @@ ExecCountSlotsNode(Plan *node)
 		case T_Agg:
 			return ExecCountSlotsAgg((Agg *) node);
 
+		case T_WindowAgg:
+			return ExecCountSlotsWindowAgg((WindowAgg *) node);
+			break;
+
 		case T_Unique:
 			return ExecCountSlotsUnique((Unique *) node);
 
@@ -749,6 +763,10 @@ ExecEndNode(PlanState *node)
 			ExecEndAgg((AggState *) node);
 			break;
 
+		case T_WindowAggState:
+			ExecEndWindowAgg((WindowAggState *) node);
+			break;
+
 		case T_UniqueState:
 			ExecEndUnique((UniqueState *) node);
 			break;
diff --git a/src/backend/executor/execQual.c b/src/backend/executor/execQual.c
index 71aad49647d..17606f5204e 100644
--- a/src/backend/executor/execQual.c
+++ b/src/backend/executor/execQual.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/execQual.c,v 1.238 2008/12/18 19:38:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/execQual.c,v 1.239 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,6 +62,9 @@ static Datum ExecEvalArrayRef(ArrayRefExprState *astate,
 static Datum ExecEvalAggref(AggrefExprState *aggref,
 			   ExprContext *econtext,
 			   bool *isNull, ExprDoneCond *isDone);
+static Datum ExecEvalWindowFunc(WindowFuncExprState *wfunc,
+			   ExprContext *econtext,
+			   bool *isNull, ExprDoneCond *isDone);
 static Datum ExecEvalVar(ExprState *exprstate, ExprContext *econtext,
 			bool *isNull, ExprDoneCond *isDone);
 static Datum ExecEvalScalarVar(ExprState *exprstate, ExprContext *econtext,
@@ -444,6 +447,27 @@ ExecEvalAggref(AggrefExprState *aggref, ExprContext *econtext,
 }
 
 /* ----------------------------------------------------------------
+ *		ExecEvalWindowFunc
+ *
+ *		Returns a Datum whose value is the value of the precomputed
+ *		window function found in the given expression context.
+ * ----------------------------------------------------------------
+ */
+static Datum
+ExecEvalWindowFunc(WindowFuncExprState *wfunc, ExprContext *econtext,
+				   bool *isNull, ExprDoneCond *isDone)
+{
+	if (isDone)
+		*isDone = ExprSingleResult;
+
+	if (econtext->ecxt_aggvalues == NULL)		/* safety check */
+		elog(ERROR, "no window functions in this expression context");
+
+	*isNull = econtext->ecxt_aggnulls[wfunc->wfuncno];
+	return econtext->ecxt_aggvalues[wfunc->wfuncno];
+}
+
+/* ----------------------------------------------------------------
  *		ExecEvalVar
  *
  *		Returns a Datum whose value is the value of a range
@@ -4062,12 +4086,12 @@ ExecEvalExprSwitchContext(ExprState *expression,
  * executions of the expression are needed.  Typically the context will be
  * the same as the per-query context of the associated ExprContext.
  *
- * Any Aggref and SubPlan nodes found in the tree are added to the lists
- * of such nodes held by the parent PlanState.	Otherwise, we do very little
- * initialization here other than building the state-node tree.  Any nontrivial
- * work associated with initializing runtime info for a node should happen
- * during the first actual evaluation of that node.  (This policy lets us
- * avoid work if the node is never actually evaluated.)
+ * Any Aggref, WindowFunc, or SubPlan nodes found in the tree are added to the
+ * lists of such nodes held by the parent PlanState. Otherwise, we do very
+ * little initialization here other than building the state-node tree.  Any
+ * nontrivial work associated with initializing runtime info for a node should
+ * happen during the first actual evaluation of that node.  (This policy lets
+ * us avoid work if the node is never actually evaluated.)
  *
  * Note: there is no ExecEndExpr function; we assume that any resource
  * cleanup needed will be handled by just releasing the memory context
@@ -4145,11 +4169,49 @@ ExecInitExpr(Expr *node, PlanState *parent)
 				else
 				{
 					/* planner messed up */
-					elog(ERROR, "aggref found in non-Agg plan node");
+					elog(ERROR, "Aggref found in non-Agg plan node");
 				}
 				state = (ExprState *) astate;
 			}
 			break;
+		case T_WindowFunc:
+			{
+				WindowFunc *wfunc = (WindowFunc *) node;
+				WindowFuncExprState *wfstate = makeNode(WindowFuncExprState);
+
+				wfstate->xprstate.evalfunc = (ExprStateEvalFunc) ExecEvalWindowFunc;
+				if (parent && IsA(parent, WindowAggState))
+				{
+					WindowAggState *winstate = (WindowAggState *) parent;
+					int			nfuncs;
+
+					winstate->funcs = lcons(wfstate, winstate->funcs);
+					nfuncs = ++winstate->numfuncs;
+					if (wfunc->winagg)
+						winstate->numaggs++;
+
+					wfstate->args = (List *) ExecInitExpr((Expr *) wfunc->args,
+														  parent);
+
+					/*
+					 * Complain if the windowfunc's arguments contain any
+					 * windowfuncs; nested window functions are semantically
+					 * nonsensical.  (This should have been caught earlier,
+					 * but we defend against it here anyway.)
+					 */
+					if (nfuncs != winstate->numfuncs)
+						ereport(ERROR,
+								(errcode(ERRCODE_WINDOWING_ERROR),
+						errmsg("window function calls cannot be nested")));
+				}
+				else
+				{
+					/* planner messed up */
+					elog(ERROR, "WindowFunc found in non-WindowAgg plan node");
+				}
+				state = (ExprState *) wfstate;
+			}
+			break;
 		case T_ArrayRef:
 			{
 				ArrayRef   *aref = (ArrayRef *) node;
diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c
new file mode 100644
index 00000000000..37ef9a5e830
--- /dev/null
+++ b/src/backend/executor/nodeWindowAgg.c
@@ -0,0 +1,1854 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeWindowAgg.c
+ *	  routines to handle WindowAgg nodes.
+ *
+ * A WindowAgg node evaluates "window functions" across suitable partitions
+ * of the input tuple set.  Any one WindowAgg works for just a single window
+ * specification, though it can evaluate multiple window functions sharing
+ * identical window specifications.  The input tuples are required to be
+ * delivered in sorted order, with the PARTITION BY columns (if any) as
+ * major sort keys and the ORDER BY columns (if any) as minor sort keys.
+ * (The planner generates a stack of WindowAggs with intervening Sort nodes
+ * as needed, if a query involves more than one window specification.)
+ *
+ * Since window functions can require access to any or all of the rows in
+ * the current partition, we accumulate rows of the partition into a
+ * tuplestore.  The window functions are called using the WindowObject API
+ * so that they can access those rows as needed.
+ *
+ * We also support using plain aggregate functions as window functions.
+ * For these, the regular Agg-node environment is emulated for each partition.
+ * As required by the SQL spec, the output represents the value of the
+ * aggregate function over all rows in the current row's window frame.
+ *
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeWindowAgg.c,v 1.1 2008/12/28 18:53:55 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_aggregate.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#include "executor/nodeWindowAgg.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/clauses.h"
+#include "parser/parse_agg.h"
+#include "parser/parse_coerce.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+#include "windowapi.h"
+
+/*
+ * All the window function APIs are called with this object, which is passed
+ * to window functions as fcinfo->context.
+ *
+ * Each WindowStatePerFuncData refers to one of these through its "winobj"
+ * field.  For functions that are not plain aggregates, begin_partition()
+ * allocates fresh mark and read pointers in the partition's tuplestore and
+ * sets markpos and seekpos to -1; release_partition() resets localmem to
+ * NULL at the end of each partition.
+ */
+typedef struct WindowObjectData
+{
+	NodeTag		type;
+	WindowAggState *winstate;	/* parent WindowAggState */
+	List	   *argstates;		/* ExprState trees for fn's arguments */
+	void	   *localmem;		/* WinGetPartitionLocalMemory's chunk; NULL
+								 * after each partition is released */
+	int			markptr;		/* tuplestore mark pointer for this fn */
+	int			readptr;		/* tuplestore read pointer for this fn */
+	int64		markpos;		/* row that markptr is positioned on (-1 at
+								 * partition start) */
+	int64		seekpos;		/* row that readptr is positioned on (-1 at
+								 * partition start) */
+} WindowObjectData;
+
+/*
+ * We have one WindowStatePerFunc struct for each window function and
+ * window aggregate handled by this node.
+ *
+ * These live in the winstate->perfunc[] array.  Entries with plain_agg set
+ * are evaluated by eval_windowaggregates(); the others are evaluated by
+ * eval_windowfunction().
+ */
+typedef struct WindowStatePerFuncData
+{
+	/* Links to WindowFunc expr and state nodes this working state is for */
+	WindowFuncExprState *wfuncstate;
+	WindowFunc	   *wfunc;
+
+	int			numArguments;	/* number of arguments */
+
+	FmgrInfo	flinfo;			/* fmgr lookup data for window function */
+
+	/*
+	 * We need the len and byval info for the result of each function
+	 * in order to know how to copy/delete values.
+	 */
+	int16		resulttypeLen;
+	bool		resulttypeByVal;
+
+	bool		plain_agg;		/* is it just a plain aggregate function? */
+	int			aggno;			/* if so, index of its PerAggData in
+								 * winstate->peragg[] */
+
+	WindowObject	winobj;		/* object used in window function API; passed
+								 * to the function as fcinfo->context */
+} WindowStatePerFuncData;
+
+/*
+ * For plain aggregate window functions, we also have one of these.
+ *
+ * These live in the winstate->peragg[] array; wfuncno links each entry back
+ * to its WindowStatePerFuncData in winstate->perfunc[].
+ */
+typedef struct WindowStatePerAggData
+{
+	/* Oids of the aggregate's transition and final functions */
+	Oid			transfn_oid;
+	Oid			finalfn_oid;	/* may be InvalidOid */
+
+	/*
+	 * fmgr lookup data for the transition and final functions --- only valid
+	 * when corresponding oid is not InvalidOid.  Note in particular that
+	 * fn_strict flags are kept here.
+	 */
+	FmgrInfo	transfn;
+	FmgrInfo	finalfn;
+
+	/*
+	 * initial value from pg_aggregate entry
+	 */
+	Datum		initValue;
+	bool		initValueIsNull;
+
+	/*
+	 * cached value for non-moving frame.  eval_windowaggregates() stores the
+	 * finalized result here (copied into wincontext when pass-by-reference)
+	 * and hands it back for later rows it has already aggregated through.
+	 * hasResult is cleared again by release_partition().
+	 */
+	Datum		resultValue;
+	bool		resultValueIsNull;
+	bool		hasResult;
+
+	/*
+	 * We need the len and byval info for the agg's input, result, and
+	 * transition data types in order to know how to copy/delete values.
+	 */
+	int16		inputtypeLen,
+				resulttypeLen,
+				transtypeLen;
+	bool		inputtypeByVal,
+				resulttypeByVal,
+				transtypeByVal;
+
+	int			wfuncno;		/* index of associated PerFuncData */
+
+	/* Current transition value */
+	Datum		transValue;		/* current transition value */
+	bool		transValueIsNull;
+
+	bool		noTransValue;	/* true if transValue not set yet; in that
+								 * state a strict transfn's first non-null
+								 * input becomes the transValue */
+} WindowStatePerAggData;
+
+/* Per-aggregate transition-state handling (parallels nodeAgg.c) */
+static void initialize_windowaggregate(WindowAggState *winstate,
+						   WindowStatePerFunc perfuncstate,
+						   WindowStatePerAgg peraggstate);
+static void advance_windowaggregate(WindowAggState *winstate,
+						WindowStatePerFunc perfuncstate,
+						WindowStatePerAgg peraggstate);
+static void finalize_windowaggregate(WindowAggState *winstate,
+						 WindowStatePerFunc perfuncstate,
+						 WindowStatePerAgg peraggstate,
+						 Datum *result, bool *isnull);
+
+/* Per-row evaluation of aggregates and of true window functions */
+static void eval_windowaggregates(WindowAggState *winstate);
+static void eval_windowfunction(WindowAggState *winstate,
+					WindowStatePerFunc perfuncstate,
+					Datum *result, bool *isnull);
+
+/* Partition buffering */
+static void begin_partition(WindowAggState *winstate);
+static void spool_tuples(WindowAggState *winstate, int64 pos);
+static void release_partition(WindowAggState *winstate);
+
+/* Node initialization helpers */
+static WindowStatePerAggData *initialize_peragg(WindowAggState *winstate,
+				  WindowFunc *wfunc,
+				  WindowStatePerAgg peraggstate);
+static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
+
+/* Row comparison and tuplestore access */
+static bool are_peers(WindowAggState *winstate,
+		  TupleTableSlot *slot1, TupleTableSlot *slot2);
+static bool window_gettupleslot(WindowObject winobj,
+					int64 pos, TupleTableSlot *slot);
+
+
+/*
+ * initialize_windowaggregate
+ * parallel to initialize_aggregate in nodeAgg.c
+ *
+ * Resets one aggregate's transition state to the aggregate's initial value.
+ * A non-null initial value is copied into wincontext; a null one is simply
+ * carried over, and noTransValue is set so that the first input can later
+ * stand in for the missing initial value.
+ */
+static void
+initialize_windowaggregate(WindowAggState *winstate,
+						   WindowStatePerFunc perfuncstate,
+						   WindowStatePerAgg peraggstate)
+{
+	bool		initIsNull = peraggstate->initValueIsNull;
+
+	if (!initIsNull)
+	{
+		MemoryContext savedContext;
+
+		/* make a private copy of the initial value in wincontext */
+		savedContext = MemoryContextSwitchTo(winstate->wincontext);
+		peraggstate->transValue = datumCopy(peraggstate->initValue,
+											peraggstate->transtypeByVal,
+											peraggstate->transtypeLen);
+		MemoryContextSwitchTo(savedContext);
+	}
+	else
+	{
+		/* null initial value: nothing to copy */
+		peraggstate->transValue = peraggstate->initValue;
+	}
+
+	peraggstate->transValueIsNull = initIsNull;
+	peraggstate->noTransValue = initIsNull;
+}
+
+/*
+ * advance_windowaggregate
+ * parallel to advance_aggregate in nodeAgg.c
+ *
+ * Evaluates the aggregate's argument expressions against winstate->tmpcontext
+ * (whose input row is set up by the caller), then applies the transition
+ * function and stores the new transition value in peraggstate.  Argument
+ * evaluation and the transition call run in tmpcontext's per-tuple memory;
+ * a pass-by-reference transition value is kept in winstate->wincontext.
+ * The previous memory context is restored on every exit path.
+ */
+static void
+advance_windowaggregate(WindowAggState *winstate,
+						WindowStatePerFunc perfuncstate,
+						WindowStatePerAgg peraggstate)
+{
+	WindowFuncExprState	   *wfuncstate = perfuncstate->wfuncstate;
+	int						numArguments = perfuncstate->numArguments;
+	FunctionCallInfoData	fcinfodata;
+	FunctionCallInfo		fcinfo = &fcinfodata;
+	Datum					newVal;
+	ListCell			   *arg;
+	int						i;
+	MemoryContext			oldContext;
+	ExprContext *econtext = winstate->tmpcontext;
+
+	oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+
+	/*
+	 * We start from 1, since the 0th arg will be the transition value.
+	 * Note the arguments are stored into fcinfo before
+	 * InitFunctionCallInfoData is applied further down.
+	 * NOTE(review): this presumably relies on that macro leaving arg[] and
+	 * argnull[] untouched --- confirm against fmgr.h.
+	 */
+	i = 1;
+	foreach(arg, wfuncstate->args)
+	{
+		ExprState	   *argstate = (ExprState *) lfirst(arg);
+
+		fcinfo->arg[i] = ExecEvalExpr(argstate, econtext,
+									  &fcinfo->argnull[i], NULL);
+		i++;
+	}
+
+	if (peraggstate->transfn.fn_strict)
+	{
+		/*
+		 * For a strict transfn, nothing happens when there's a NULL input; we
+		 * just keep the prior transValue.
+		 */
+		for (i = 1; i <= numArguments; i++)
+		{
+			if (fcinfo->argnull[i])
+			{
+				MemoryContextSwitchTo(oldContext);
+				return;
+			}
+		}
+		if (peraggstate->noTransValue)
+		{
+			/*
+			 * transValue has not been initialized. This is the first non-NULL
+			 * input value. We use it as the initial value for transValue. (We
+			 * already checked that the agg's input type is binary-compatible
+			 * with its transtype, so straight copy here is OK.)
+			 *
+			 * We must copy the datum into wincontext if it is pass-by-ref. We
+			 * do not need to pfree the old transValue, since it's NULL.
+			 *
+			 * The transition function itself is not called for this row.
+			 */
+			MemoryContextSwitchTo(winstate->wincontext);
+			peraggstate->transValue = datumCopy(fcinfo->arg[1],
+											 peraggstate->transtypeByVal,
+											 peraggstate->transtypeLen);
+			peraggstate->transValueIsNull = false;
+			peraggstate->noTransValue = false;
+			MemoryContextSwitchTo(oldContext);
+			return;
+		}
+		if (peraggstate->transValueIsNull)
+		{
+			/*
+			 * Don't call a strict function with NULL inputs.  Note it is
+			 * possible to get here despite the above tests, if the transfn is
+			 * strict *and* returned a NULL on a prior cycle. If that happens
+			 * we will propagate the NULL all the way to the end.
+			 */
+			MemoryContextSwitchTo(oldContext);
+			return;
+		}
+	}
+
+	/*
+	 * OK to call the transition function.  The WindowAggState itself is
+	 * passed as fcinfo->context, and the current transition value goes in
+	 * as argument 0.
+	 */
+	InitFunctionCallInfoData(*fcinfo, &(peraggstate->transfn),
+							 numArguments + 1,
+							 (void *) winstate, NULL);
+	fcinfo->arg[0] = peraggstate->transValue;
+	fcinfo->argnull[0] = peraggstate->transValueIsNull;
+	newVal = FunctionCallInvoke(fcinfo);
+
+	/*
+	 * If pass-by-ref datatype, must copy the new value into wincontext and
+	 * pfree the prior transValue.  But if transfn returned a pointer to its
+	 * first input, we don't need to do anything.  (A NULL result is not
+	 * copied, but the prior non-NULL transValue is still freed.)
+	 */
+	if (!peraggstate->transtypeByVal &&
+		DatumGetPointer(newVal) != DatumGetPointer(peraggstate->transValue))
+	{
+		if (!fcinfo->isnull)
+		{
+			MemoryContextSwitchTo(winstate->wincontext);
+			newVal = datumCopy(newVal,
+							   peraggstate->transtypeByVal,
+							   peraggstate->transtypeLen);
+		}
+		if (!peraggstate->transValueIsNull)
+			pfree(DatumGetPointer(peraggstate->transValue));
+	}
+
+	MemoryContextSwitchTo(oldContext);
+	peraggstate->transValue = newVal;
+	peraggstate->transValueIsNull = fcinfo->isnull;
+}
+
+/*
+ * finalize_windowaggregate
+ * parallel to finalize_aggregate in nodeAgg.c
+ *
+ * Computes the aggregate's current output from its transition value and
+ * returns it through *result / *isnull.  The work is done in the per-tuple
+ * memory of the node's output expression context, and a non-null
+ * pass-by-reference result is guaranteed to be allocated there.
+ */
+static void
+finalize_windowaggregate(WindowAggState *winstate,
+						 WindowStatePerFunc perfuncstate,
+						 WindowStatePerAgg peraggstate,
+						 Datum *result, bool *isnull)
+{
+	ExprContext *outcontext = winstate->ss.ps.ps_ExprContext;
+	MemoryContext savedContext;
+
+	savedContext = MemoryContextSwitchTo(outcontext->ecxt_per_tuple_memory);
+
+	if (!OidIsValid(peraggstate->finalfn_oid))
+	{
+		/* No final function: the transition value is the result */
+		*result = peraggstate->transValue;
+		*isnull = peraggstate->transValueIsNull;
+	}
+	else
+	{
+		FunctionCallInfoData finalcall;
+
+		InitFunctionCallInfoData(finalcall, &(peraggstate->finalfn), 1,
+								 (void *) winstate, NULL);
+		finalcall.arg[0] = peraggstate->transValue;
+		finalcall.argnull[0] = peraggstate->transValueIsNull;
+
+		if (!(finalcall.flinfo->fn_strict && peraggstate->transValueIsNull))
+		{
+			*result = FunctionCallInvoke(&finalcall);
+			*isnull = finalcall.isnull;
+		}
+		else
+		{
+			/* a strict final function is never called with a NULL input */
+			*result = (Datum) 0;
+			*isnull = true;
+		}
+	}
+
+	/*
+	 * A non-null pass-by-ref result must live in the current (per-tuple)
+	 * context; copy it there if it doesn't already.
+	 */
+	if (!peraggstate->resulttypeByVal && !*isnull)
+	{
+		if (!MemoryContextContains(CurrentMemoryContext,
+								   DatumGetPointer(*result)))
+			*result = datumCopy(*result,
+								peraggstate->resulttypeByVal,
+								peraggstate->resulttypeLen);
+	}
+
+	MemoryContextSwitchTo(savedContext);
+}
+
+/*
+ * eval_windowaggregates
+ * evaluate plain aggregates being used as window functions
+ *
+ * Much of this is duplicated from nodeAgg.c.  But NOTE that we expect to be
+ * able to call aggregate final functions repeatedly after aggregating more
+ * data onto the same transition value.  This is not a behavior required by
+ * nodeAgg.c.
+ *
+ * On return, the ecxt_aggvalues/ecxt_aggnulls entries of the node's output
+ * expression context are filled in for every plain aggregate, and the same
+ * values are cached in the per-aggregate state for reuse on peer rows.
+ */
+static void
+eval_windowaggregates(WindowAggState *winstate)
+{
+	WindowStatePerAgg   peraggstate;
+	int					wfuncno, numaggs;
+	int					i;
+	MemoryContext		oldContext;
+	ExprContext		   *econtext;
+	TupleTableSlot	   *first_peer_slot = winstate->first_peer_slot;
+	TupleTableSlot	   *slot;
+	bool				first;
+
+	numaggs = winstate->numaggs;
+	if (numaggs == 0)
+		return;					/* nothing to do */
+
+	/* final output execution is in ps_ExprContext */
+	econtext = winstate->ss.ps.ps_ExprContext;
+
+	/*
+	 * We don't currently support explicitly-specified window frames.  That
+	 * means that the window frame always includes all the rows in the
+	 * partition preceding and including the current row, and all its
+	 * peers. As a special case, if there's no ORDER BY, all rows are peers,
+	 * so the window frame includes all rows in the partition.
+	 *
+	 * When there's peer rows, all rows in a peer group will have the same
+	 * aggregate values.  The values will be calculated when current position
+	 * reaches the first peer row, and on all the following peer rows we will
+	 * just return the saved results.
+	 *
+	 * 'aggregatedupto' keeps track of the last row that has already been
+	 * accumulated for the aggregates. When the current row has no peers,
+	 * aggregatedupto will be the same as the current row after this
+	 * function. If there are peer rows, all peers will be accumulated in one
+	 * call of this function, and aggregatedupto will be ahead of the current
+	 * position. If there's no ORDER BY, and thus all rows are peers, the
+	 * first call will aggregate all rows in the partition.
+	 *
+	 * TODO: In the future, we could implement sliding frames by recalculating
+	 * the aggregate whenever a row exits the frame. That would be pretty
+	 * slow, though. For aggregates like SUM and COUNT we could implement a
+	 * "negative transition function" that would be called for all the rows
+	 * that exit the frame.
+	 */
+
+	/*
+	 * If we've already aggregated up through current row, reuse the
+	 * saved result values
+	 */
+	if (winstate->aggregatedupto > winstate->currentpos)
+	{
+		for (i = 0; i < numaggs; i++)
+		{
+			peraggstate = &winstate->peragg[i];
+			wfuncno = peraggstate->wfuncno;
+			econtext->ecxt_aggvalues[wfuncno] = peraggstate->resultValue;
+			econtext->ecxt_aggnulls[wfuncno] = peraggstate->resultValueIsNull;
+		}
+		return;
+	}
+
+	/*
+	 * Initialize aggregates on first call for partition.  (hasResult is
+	 * cleared by release_partition and set again at the bottom of this
+	 * function, so it is false exactly until the first result is cached.)
+	 */
+	for (i = 0; i < numaggs; i++)
+	{
+		peraggstate = &winstate->peragg[i];
+		wfuncno = peraggstate->wfuncno;
+		if (!peraggstate->hasResult)
+			initialize_windowaggregate(winstate,
+									   &winstate->perfunc[wfuncno],
+									   &winstate->peragg[i]);
+	}
+
+	/*
+	 * If this is the first call for this partition, fetch the first row
+	 * for comparing peer rows. On subsequent calls, we'll always read
+	 * ahead until we reach the first non-peer row, and store that row in
+	 * first_peer_slot, for use in the next call.
+	 */
+	if (TupIsNull(first_peer_slot))
+	{
+		spool_tuples(winstate, winstate->aggregatedupto);
+		tuplestore_select_read_pointer(winstate->buffer, winstate->agg_ptr);
+		if (!tuplestore_gettupleslot(winstate->buffer, true, first_peer_slot))
+			elog(ERROR, "unexpected end of tuplestore");
+	}
+
+	/*
+	 * Advance until we reach the next non-peer row
+	 */
+	first = true;
+	for (;;)
+	{
+		if (!first)
+		{
+			/* Fetch the next row, and see if it's a peer */
+			spool_tuples(winstate, winstate->aggregatedupto);
+			tuplestore_select_read_pointer(winstate->buffer,
+										   winstate->agg_ptr);
+			slot = winstate->temp_slot_1;
+			if (!tuplestore_gettupleslot(winstate->buffer, true, slot))
+				break;			/* no more rows in this partition */
+			if (!are_peers(winstate, first_peer_slot, slot))
+			{
+				/* remember the first row of the next peer group */
+				ExecCopySlot(first_peer_slot, slot);
+				break;
+			}
+		}
+		else
+		{
+			/*
+			 * On first iteration, just accumulate the tuple saved from
+			 * last call
+			 */
+			slot = first_peer_slot;
+			first = false;
+		}
+
+		/* set tuple context for evaluation of aggregate arguments */
+		winstate->tmpcontext->ecxt_outertuple = slot;
+
+		for (i = 0; i < numaggs; i++)
+		{
+			wfuncno = winstate->peragg[i].wfuncno;
+
+			advance_windowaggregate(winstate,
+									&winstate->perfunc[wfuncno],
+									&winstate->peragg[i]);
+
+		}
+		/* Reset per-input-tuple context after each tuple */
+		ResetExprContext(winstate->tmpcontext);
+		winstate->aggregatedupto++;
+	}
+
+	/*
+	 * finalize aggregates and fill result/isnull fields.
+	 */
+	for (i = 0; i < numaggs; i++)
+	{
+		Datum	   *result;
+		bool	   *isnull;
+
+		peraggstate = &winstate->peragg[i];
+		wfuncno = peraggstate->wfuncno;
+		result = &econtext->ecxt_aggvalues[wfuncno];
+		isnull = &econtext->ecxt_aggnulls[wfuncno];
+		finalize_windowaggregate(winstate,
+								 &winstate->perfunc[wfuncno],
+								 peraggstate, result, isnull);
+
+		/*
+		 * save the result for the next (non-shrinking frame) call.
+		 */
+		if (!peraggstate->resulttypeByVal)
+		{
+			/*
+			 * clear old resultValue in order not to leak memory.  This must
+			 * be done even when the new result is NULL; otherwise the old
+			 * copy would stay allocated in wincontext until the partition
+			 * is released.
+			 */
+			if (peraggstate->hasResult &&
+				!peraggstate->resultValueIsNull &&
+				(DatumGetPointer(peraggstate->resultValue) !=
+				 DatumGetPointer(*result)))
+				pfree(DatumGetPointer(peraggstate->resultValue));
+
+			if (!*isnull)
+			{
+				/*
+				 * If pass-by-ref, copy it into our global context.
+				 */
+				oldContext = MemoryContextSwitchTo(winstate->wincontext);
+				peraggstate->resultValue = datumCopy(*result,
+												 peraggstate->resulttypeByVal,
+												 peraggstate->resulttypeLen);
+				MemoryContextSwitchTo(oldContext);
+			}
+			else
+			{
+				/* NULL result: there is nothing to copy */
+				peraggstate->resultValue = *result;
+			}
+		}
+		else
+		{
+			peraggstate->resultValue = *result;
+		}
+		peraggstate->resultValueIsNull = *isnull;
+		peraggstate->hasResult = true;
+	}
+}
+
+/*
+ * eval_windowfunction
+ *
+ * Call one true window function for the current row and return its value
+ * through *result / *isnull.
+ *
+ * The function's arguments are deliberately not evaluated here: a window
+ * function may need random access to arbitrary rows in the partition, so it
+ * fetches argument values itself via WinGetFuncArgInPartition and
+ * WinGetFuncArgInFrame for whichever rows it wants.
+ */
+static void
+eval_windowfunction(WindowAggState *winstate, WindowStatePerFunc perfuncstate,
+					Datum *result, bool *isnull)
+{
+	ExprContext *outcontext = winstate->ss.ps.ps_ExprContext;
+	int			nargs = perfuncstate->numArguments;
+	FunctionCallInfoData call;
+	MemoryContext savedContext;
+
+	savedContext = MemoryContextSwitchTo(outcontext->ecxt_per_tuple_memory);
+
+	/*
+	 * No ordinary arguments are passed, but the argument count is, so that
+	 * a window function implementation can support varying numbers of
+	 * arguments.  Everything else travels through the WindowObject in
+	 * fcinfo->context.
+	 */
+	InitFunctionCallInfoData(call, &(perfuncstate->flinfo),
+							 nargs,
+							 (void *) perfuncstate->winobj, NULL);
+	/* Just in case, mark every regular argument slot as null */
+	memset(call.argnull, true, nargs);
+
+	*result = FunctionCallInvoke(&call);
+	*isnull = call.isnull;
+
+	/*
+	 * The function may well return a pointer into some short-lived tuple,
+	 * so make sure a non-null pass-by-ref result lives in the current
+	 * context.
+	 */
+	if (!perfuncstate->resulttypeByVal && !call.isnull)
+	{
+		if (!MemoryContextContains(CurrentMemoryContext,
+								   DatumGetPointer(*result)))
+			*result = datumCopy(*result,
+								perfuncstate->resulttypeByVal,
+								perfuncstate->resulttypeLen);
+	}
+
+	MemoryContextSwitchTo(savedContext);
+}
+
+/*
+ * begin_partition
+ * Start buffering rows of the next partition.
+ *
+ * Resets the per-partition counters, creates a fresh tuplestore with its
+ * read pointers, and stores the partition's first row into it.  If the
+ * outer plan turns out to be empty, no tuplestore is created and the node
+ * is flagged as having no more partitions.
+ */
+static void
+begin_partition(WindowAggState *winstate)
+{
+	int			funcno;
+
+	/* Reset per-partition bookkeeping */
+	winstate->partition_spooled = false;
+	winstate->spooled_rows = 0;
+	winstate->currentpos = 0;
+	winstate->frametailpos = -1;
+	winstate->aggregatedupto = 0;
+
+	/*
+	 * For the very first partition there is no saved first row yet, so
+	 * fetch one from the outer plan.
+	 */
+	if (TupIsNull(winstate->first_part_slot))
+	{
+		TupleTableSlot *firstrow = ExecProcNode(outerPlanState(winstate));
+
+		if (TupIsNull(firstrow))
+		{
+			/* outer plan is empty, so we have nothing to do */
+			winstate->partition_spooled = true;
+			winstate->more_partitions = false;
+			return;
+		}
+		ExecCopySlot(winstate->first_part_slot, firstrow);
+	}
+
+	/* Create new tuplestore for this partition */
+	winstate->buffer = tuplestore_begin_heap(false, false, work_mem);
+
+	/*
+	 * Set up the tuplestore's read pointers.  Only the per-window-function
+	 * read pointers need BACKWARD capability; the current-row and aggregate
+	 * pointers do not.
+	 */
+	winstate->current_ptr = 0;	/* read pointer 0 is pre-allocated */
+
+	/* reset default REWIND capability bit for current ptr */
+	tuplestore_set_eflags(winstate->buffer, 0);
+
+	/* create a read pointer for aggregates, if needed */
+	if (winstate->numaggs > 0)
+		winstate->agg_ptr = tuplestore_alloc_read_pointer(winstate->buffer, 0);
+
+	/* create mark and read pointers for each real window function */
+	for (funcno = 0; funcno < winstate->numfuncs; funcno++)
+	{
+		WindowStatePerFunc pf = &(winstate->perfunc[funcno]);
+		WindowObject wobj;
+
+		if (pf->plain_agg)
+			continue;			/* plain aggregates share agg_ptr */
+
+		wobj = pf->winobj;
+		wobj->markptr = tuplestore_alloc_read_pointer(winstate->buffer, 0);
+		wobj->readptr = tuplestore_alloc_read_pointer(winstate->buffer,
+													  EXEC_FLAG_BACKWARD);
+		wobj->markpos = -1;
+		wobj->seekpos = -1;
+	}
+
+	/*
+	 * The first tuple is always available by now: it was either read just
+	 * above or saved at the end of the previous partition.  Put it into the
+	 * tuplestore.
+	 */
+	tuplestore_puttupleslot(winstate->buffer, winstate->first_part_slot);
+	winstate->spooled_rows++;
+}
+
+/*
+ * spool_tuples
+ * Read tuples from the outer node, up to position 'pos', and store them
+ * into the tuplestore. If pos is -1, reads the whole partition.
+ *
+ * This is a no-op when there is no tuplestore or when the partition has
+ * already been spooled completely.  Reading stops early at end of input or
+ * at the first row belonging to the next partition; either way
+ * partition_spooled is set, and more_partitions records which case it was.
+ * In the latter case the row is saved in first_part_slot, not the tuplestore.
+ */
+static void
+spool_tuples(WindowAggState *winstate, int64 pos)
+{
+	WindowAgg	   *node = (WindowAgg *) winstate->ss.ps.plan;
+	PlanState	   *outerPlan;
+	TupleTableSlot *outerslot;
+	MemoryContext oldcontext;
+
+	if (!winstate->buffer)
+		return;					/* just a safety check */
+	if (winstate->partition_spooled)
+		return;					/* whole partition done already */
+
+	/*
+	 * If the tuplestore has spilled to disk, alternate reading and writing
+	 * becomes quite expensive due to frequent buffer flushes.  It's cheaper
+	 * to force the entire partition to get spooled in one go.
+	 *
+	 * XXX this is a horrid kluge --- it'd be better to fix the performance
+	 * problem inside tuplestore.  FIXME
+	 */
+	if (!tuplestore_in_memory(winstate->buffer))
+		pos = -1;
+
+	outerPlan = outerPlanState(winstate);
+
+	/* Must be in query context to call outerplan or touch tuplestore */
+	oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+	while (winstate->spooled_rows <= pos || pos == -1)
+	{
+		outerslot = ExecProcNode(outerPlan);
+		if (TupIsNull(outerslot))
+		{
+			/* reached the end of the last partition */
+			winstate->partition_spooled = true;
+			winstate->more_partitions = false;
+			break;
+		}
+
+		if (node->partNumCols > 0)
+		{
+			/*
+			 * Check if this tuple still belongs to the current partition,
+			 * by comparing its partition columns with those of the
+			 * partition's first row
+			 */
+			if (!execTuplesMatch(winstate->first_part_slot,
+								 outerslot,
+								 node->partNumCols, node->partColIdx,
+								 winstate->partEqfunctions,
+								 winstate->tmpcontext->ecxt_per_tuple_memory))
+			{
+				/*
+				 * end of partition; copy the tuple for the next cycle.
+				 * (It overwrites the saved first row of this partition.)
+				 */
+				ExecCopySlot(winstate->first_part_slot, outerslot);
+				winstate->partition_spooled = true;
+				winstate->more_partitions = true;
+				break;
+			}
+		}
+
+		/* Still in partition, so save it into the tuplestore */
+		tuplestore_puttupleslot(winstate->buffer, outerslot);
+		winstate->spooled_rows++;
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * release_partition
+ * Discard everything that belongs to the partition just finished: window
+ * functions' partition-local state, cached aggregate results, all memory
+ * in wincontext, the saved peer row, and the tuplestore itself.
+ */
+static void
+release_partition(WindowAggState *winstate)
+{
+	int			funcno;
+
+	for (funcno = 0; funcno < winstate->numfuncs; funcno++)
+	{
+		WindowStatePerFunc pf = &(winstate->perfunc[funcno]);
+		WindowStatePerAggData *pagg;
+
+		/* Forget any partition-local state of this window function */
+		if (pf->winobj != NULL)
+			pf->winobj->localmem = NULL;
+
+		if (!pf->plain_agg)
+			continue;
+
+		/* Invalidate the aggregate's cached result */
+		pagg = &winstate->peragg[pf->aggno];
+		pagg->resultValueIsNull = true;
+		pagg->hasResult = false;
+	}
+
+	/*
+	 * Free all partition-local memory in one go.  That covers the state and
+	 * aggregate temp data whose pointers were just dropped above, as well
+	 * as anything aggregates allocated that we hold no direct pointer to
+	 * (which is why retail pfree would not be enough).
+	 */
+	MemoryContextResetAndDeleteChildren(winstate->wincontext);
+
+	/* Ensure eval_windowaggregates will see next call as partition start */
+	ExecClearTuple(winstate->first_peer_slot);
+
+	if (winstate->buffer != NULL)
+		tuplestore_end(winstate->buffer);
+	winstate->buffer = NULL;
+	winstate->partition_spooled = false;
+}
+
+
+/* -----------------
+ * ExecWindowAgg
+ *
+ *	Returns the next output row of the WindowAgg node, or NULL when the
+ *	input is exhausted.  Input rows from the outer subplan are buffered
+ *	in a tuplestore one partition at a time, and the window functions are
+ *	evaluated against that buffer.  No row is filtered out or dropped, so
+ *	the number of rows returned equals the number produced by the outer
+ *	subplan (ignoring the case of SRFs in the targetlist, that is).
+ * -----------------
+ */
+TupleTableSlot *
+ExecWindowAgg(WindowAggState *winstate)
+{
+	TupleTableSlot *result;
+	ExprDoneCond	isDone;
+	ExprContext	   *econtext;
+	int				funcno;
+
+	if (winstate->all_done)
+		return NULL;
+
+	/*
+	 * A set-returning function in the projection may still have rows left
+	 * to emit for the previous source row; if so, keep projecting from it.
+	 */
+	if (winstate->ss.ps.ps_TupFromTlist)
+	{
+		result = ExecProject(winstate->ss.ps.ps_ProjInfo, &isDone);
+		if (isDone == ExprMultipleResult)
+			return result;
+		/* Done with that source tuple... */
+		winstate->ss.ps.ps_TupFromTlist = false;
+	}
+
+restart:
+	if (winstate->buffer != NULL)
+	{
+		/* Advance current row within partition */
+		winstate->currentpos++;
+	}
+	else
+	{
+		/* Initialize for first partition and set current row = 0 */
+		begin_partition(winstate);
+	}
+
+	/*
+	 * Make sure everything up to and including the current row has been
+	 * spooled into the tuplestore.
+	 */
+	spool_tuples(winstate, winstate->currentpos);
+
+	/* Move to the next partition if we reached the end of this partition */
+	if (winstate->partition_spooled &&
+		winstate->currentpos >= winstate->spooled_rows)
+	{
+		release_partition(winstate);
+
+		if (!winstate->more_partitions)
+		{
+			winstate->all_done = true;
+			return NULL;
+		}
+
+		begin_partition(winstate);
+		Assert(winstate->spooled_rows > 0);
+	}
+
+	/* final output execution is in ps_ExprContext */
+	econtext = winstate->ss.ps.ps_ExprContext;
+
+	/* Clear the per-output-tuple context for current row */
+	ResetExprContext(econtext);
+
+	/*
+	 * Fetch the current row from the tuplestore into ScanTupleSlot, where
+	 * WinGetFuncArgCurrent and the final projection can find it.  The
+	 * outerplan's own output slot can't be used for this, because we may
+	 * have had to read beyond the current row.
+	 *
+	 * The row must be present, since it was spooled above.
+	 */
+	tuplestore_select_read_pointer(winstate->buffer, winstate->current_ptr);
+	if (!tuplestore_gettupleslot(winstate->buffer, true,
+								 winstate->ss.ss_ScanTupleSlot))
+		elog(ERROR, "unexpected end of tuplestore");
+
+	/*
+	 * Evaluate true window functions
+	 */
+	for (funcno = 0; funcno < winstate->numfuncs; funcno++)
+	{
+		WindowStatePerFunc pf = &(winstate->perfunc[funcno]);
+		int			wfuncno;
+
+		if (pf->plain_agg)
+			continue;			/* handled by eval_windowaggregates */
+
+		wfuncno = pf->wfuncstate->wfuncno;
+		eval_windowfunction(winstate, pf,
+							&(econtext->ecxt_aggvalues[wfuncno]),
+							&(econtext->ecxt_aggnulls[wfuncno]));
+	}
+
+	/*
+	 * Evaluate aggregates
+	 */
+	if (winstate->numaggs > 0)
+		eval_windowaggregates(winstate);
+
+	/*
+	 * Truncate any no-longer-needed rows from the tuplestore.
+	 */
+	tuplestore_trim(winstate->buffer);
+
+	/*
+	 * Project the output row from the windowfunc results and the current
+	 * row.  Pointing ecxt_outertuple at the current row makes any Vars
+	 * evaluate against it.
+	 */
+	econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot;
+	result = ExecProject(winstate->ss.ps.ps_ProjInfo, &isDone);
+
+	if (isDone == ExprEndResult)
+	{
+		/* SRF in tlist returned no rows, so advance to next input tuple */
+		goto restart;
+	}
+
+	winstate->ss.ps.ps_TupFromTlist = (isDone == ExprMultipleResult);
+	return result;
+}
+
+/* -----------------
+ * ExecInitWindowAgg
+ *
+ *	Creates the run-time information for the WindowAgg node produced by the
+ *	planner and initializes its outer subtree
+ *
+ *	node: the WindowAgg plan node to build executor state for
+ *	estate: executor state the new node is attached to
+ *	eflags: executor flags; EXEC_FLAG_BACKWARD and EXEC_FLAG_MARK are
+ *		asserted not to be set, and the flags are passed unchanged to the
+ *		outer subplan
+ *
+ *	Returns the new WindowAggState.  Its perfunc array gets one entry per
+ *	non-duplicate window function, and its peragg array one entry per
+ *	non-duplicate function that is flagged as a plain aggregate (winagg).
+ * -----------------
+ */
+WindowAggState *
+ExecInitWindowAgg(WindowAgg *node, EState *estate, int eflags)
+{
+	WindowAggState *winstate;
+	Plan	   *outerPlan;
+	ExprContext *econtext;
+	ExprContext *tmpcontext;
+	WindowStatePerFunc  perfunc;
+	WindowStatePerAgg   peragg;
+	int			numfuncs,
+				wfuncno,
+				numaggs,
+				aggno;
+	ListCell   *l;
+
+	/* check for unsupported flags */
+	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+
+	/*
+	 * create state structure
+	 */
+	winstate = makeNode(WindowAggState);
+	winstate->ss.ps.plan = (Plan *) node;
+	winstate->ss.ps.state = estate;
+
+	/*
+	 * Create expression contexts.  We need two, one for per-input-tuple
+	 * processing and one for per-output-tuple processing.  We cheat a little
+	 * by using ExecAssignExprContext() to build both: the first one built is
+	 * saved as tmpcontext, and the second call leaves the per-output-tuple
+	 * context in ps_ExprContext.
+	 */
+	ExecAssignExprContext(estate, &winstate->ss.ps);
+	tmpcontext = winstate->ss.ps.ps_ExprContext;
+	winstate->tmpcontext = tmpcontext;
+	ExecAssignExprContext(estate, &winstate->ss.ps);
+
+	/* Create long-lived context for storage of aggregate transvalues etc */
+	winstate->wincontext =
+		AllocSetContextCreate(CurrentMemoryContext,
+							  "WindowAggContext",
+							  ALLOCSET_DEFAULT_MINSIZE,
+							  ALLOCSET_DEFAULT_INITSIZE,
+							  ALLOCSET_DEFAULT_MAXSIZE);
+
+	/*
+	 * Number of tuple slots set up just below: the scan slot, the result
+	 * slot, and four extra working slots.  ExecCountSlotsWindowAgg reports
+	 * the same constant, so keep the two in sync.
+	 */
+#define WINDOWAGG_NSLOTS 6
+
+	/*
+	 * tuple table initialization
+	 */
+	ExecInitScanTupleSlot(estate, &winstate->ss);
+	ExecInitResultTupleSlot(estate, &winstate->ss.ps);
+	winstate->first_part_slot = ExecInitExtraTupleSlot(estate);
+	winstate->first_peer_slot = ExecInitExtraTupleSlot(estate);
+	winstate->temp_slot_1 = ExecInitExtraTupleSlot(estate);
+	winstate->temp_slot_2 = ExecInitExtraTupleSlot(estate);
+
+	/*
+	 * NOTE(review): winstate->funcs, numfuncs and numaggs are read further
+	 * down without being assigned in this function; presumably they are
+	 * filled in as a side effect of initializing the WindowFunc nodes of the
+	 * target list here -- confirm against the expression-init code.
+	 */
+	winstate->ss.ps.targetlist = (List *)
+		ExecInitExpr((Expr *) node->plan.targetlist,
+					 (PlanState *) winstate);
+
+	/*
+	 * WindowAgg nodes never have quals, since they can only occur at the
+	 * logical top level of a query (ie, after any WHERE or HAVING filters)
+	 */
+	Assert(node->plan.qual == NIL);
+	winstate->ss.ps.qual = NIL;
+
+	/*
+	 * initialize child nodes
+	 */
+	outerPlan = outerPlan(node);
+	outerPlanState(winstate) = ExecInitNode(outerPlan, estate, eflags);
+
+	/*
+	 * initialize source tuple type (which is also the tuple type that we'll
+	 * store in the tuplestore and use in all our working slots).
+	 */
+	ExecAssignScanTypeFromOuterPlan(&winstate->ss);
+
+	/* all four working slots share the scan slot's tuple descriptor */
+	ExecSetSlotDescriptor(winstate->first_part_slot,
+						  winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor);
+	ExecSetSlotDescriptor(winstate->first_peer_slot,
+						  winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor);
+	ExecSetSlotDescriptor(winstate->temp_slot_1,
+						  winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor);
+	ExecSetSlotDescriptor(winstate->temp_slot_2,
+						  winstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor);
+
+	/*
+	 * Initialize result tuple type and projection info.
+	 */
+	ExecAssignResultTypeFromTL(&winstate->ss.ps);
+	ExecAssignProjectionInfo(&winstate->ss.ps, NULL);
+
+	winstate->ss.ps.ps_TupFromTlist = false;
+
+	/*
+	 * Set up data for comparing tuples.  The equality-function arrays are
+	 * only built when there is at least one partitioning (resp. ordering)
+	 * column; otherwise the fields keep whatever makeNode left in them.
+	 */
+	if (node->partNumCols > 0)
+		winstate->partEqfunctions = execTuplesMatchPrepare(node->partNumCols,
+														  node->partOperators);
+	if (node->ordNumCols > 0)
+		winstate->ordEqfunctions = execTuplesMatchPrepare(node->ordNumCols,
+														  node->ordOperators);
+
+	/*
+	 * WindowAgg nodes use aggvalues and aggnulls as well as Agg nodes.
+	 * The arrays are sized by the function count before duplicate
+	 * elimination, so they are at least as large as finally needed.
+	 */
+	numfuncs = winstate->numfuncs;
+	numaggs = winstate->numaggs;
+	econtext = winstate->ss.ps.ps_ExprContext;
+	econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numfuncs);
+	econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numfuncs);
+
+	/*
+	 * allocate per-wfunc/per-agg state information.
+	 */
+	perfunc = (WindowStatePerFunc) palloc0(sizeof(WindowStatePerFuncData) * numfuncs);
+	peragg = (WindowStatePerAgg) palloc0(sizeof(WindowStatePerAggData) * numaggs);
+	winstate->perfunc = perfunc;
+	winstate->peragg = peragg;
+
+	/* wfuncno/aggno track the highest index assigned so far (-1 = none) */
+	wfuncno = -1;
+	aggno = -1;
+	foreach(l, winstate->funcs)
+	{
+		WindowFuncExprState	   *wfuncstate = (WindowFuncExprState *) lfirst(l);
+		WindowFunc			   *wfunc = (WindowFunc *) wfuncstate->xprstate.expr;
+		WindowStatePerFunc perfuncstate;
+		AclResult	aclresult;
+		int			i;
+
+		/*
+		 * Look for a previous duplicate window function.  A function that
+		 * contains volatile functions is never treated as a duplicate, so
+		 * each occurrence gets its own entry.
+		 */
+		for (i = 0; i <= wfuncno; i++)
+		{
+			if (equal(wfunc, perfunc[i].wfunc) &&
+				!contain_volatile_functions((Node *) wfunc))
+				break;
+		}
+		if (i <= wfuncno)
+		{
+			/* Found a match to an existing entry, so just mark it */
+			wfuncstate->wfuncno = i;
+			continue;
+		}
+
+		/* Nope, so assign a new PerFunc record */
+		perfuncstate = &perfunc[++wfuncno];
+
+		/* Mark WindowFunc state node with assigned index in the result array */
+		wfuncstate->wfuncno = wfuncno;
+
+		/* Check permission to call window function */
+		aclresult = pg_proc_aclcheck(wfunc->winfnoid, GetUserId(),
+									 ACL_EXECUTE);
+		if (aclresult != ACLCHECK_OK)
+			aclcheck_error(aclresult, ACL_KIND_PROC,
+						   get_func_name(wfunc->winfnoid));
+
+		/* Fill in the perfuncstate data */
+		perfuncstate->wfuncstate = wfuncstate;
+		perfuncstate->wfunc = wfunc;
+		perfuncstate->numArguments = list_length(wfuncstate->args);
+
+		/* function lookup data is allocated in the per-query memory context */
+		fmgr_info_cxt(wfunc->winfnoid, &perfuncstate->flinfo,
+					  tmpcontext->ecxt_per_query_memory);
+		perfuncstate->flinfo.fn_expr = (Node *) wfunc;
+		get_typlenbyval(wfunc->wintype,
+						&perfuncstate->resulttypeLen,
+						&perfuncstate->resulttypeByVal);
+
+		/*
+		 * If it's really just a plain aggregate function,
+		 * we'll emulate the Agg environment for it.
+		 */
+		perfuncstate->plain_agg = wfunc->winagg;
+		if (wfunc->winagg)
+		{
+			WindowStatePerAgg	peraggstate;
+
+			/* link the perfunc and peragg entries to each other by index */
+			perfuncstate->aggno = ++aggno;
+			peraggstate = &winstate->peragg[aggno];
+			initialize_peragg(winstate, wfunc, peraggstate);
+			peraggstate->wfuncno = wfuncno;
+		}
+		else
+		{
+			/* true window function: give it a WindowObject to call the API with */
+			WindowObject winobj = makeNode(WindowObjectData);
+
+			winobj->winstate = winstate;
+			winobj->argstates = wfuncstate->args;
+			winobj->localmem = NULL;
+			perfuncstate->winobj = winobj;
+		}
+	}
+
+	/* Update numfuncs, numaggs to match number of unique functions found */
+	winstate->numfuncs = wfuncno + 1;
+	winstate->numaggs = aggno + 1;
+
+	winstate->partition_spooled = false;
+	winstate->more_partitions = false;
+
+	return winstate;
+}
+
+/* -----------------
+ * ExecCountSlotsWindowAgg
+ *
+ *	Report how many tuple table slots this node needs: the slots of its
+ *	outer and inner subplans plus the WINDOWAGG_NSLOTS it uses itself.
+ * -----------------
+ */
+int
+ExecCountSlotsWindowAgg(WindowAgg *node)
+{
+	int			nslots;
+
+	nslots = ExecCountSlotsNode(outerPlan(node));
+	nslots += ExecCountSlotsNode(innerPlan(node));
+	nslots += WINDOWAGG_NSLOTS;
+
+	return nslots;
+}
+
+/* -----------------
+ * ExecEndWindowAgg
+ *
+ *	Shut the node down: release the current partition, free the
+ *	per-function and per-aggregate arrays, clear every working slot, free
+ *	both expression contexts, delete the long-lived memory context, and
+ *	finally shut down the outer subplan.
+ * -----------------
+ */
+void
+ExecEndWindowAgg(WindowAggState *node)
+{
+	release_partition(node);
+
+	pfree(node->perfunc);
+	pfree(node->peragg);
+
+	/* empty the scan slot and each of the four working slots */
+	ExecClearTuple(node->ss.ss_ScanTupleSlot);
+	ExecClearTuple(node->first_part_slot);
+	ExecClearTuple(node->first_peer_slot);
+	ExecClearTuple(node->temp_slot_1);
+	ExecClearTuple(node->temp_slot_2);
+
+	/*
+	 * Free both the expr contexts: first the one currently installed in
+	 * ps_ExprContext, then tmpcontext after installing it in its place.
+	 */
+	ExecFreeExprContext(&node->ss.ps);
+	node->ss.ps.ps_ExprContext = node->tmpcontext;
+	ExecFreeExprContext(&node->ss.ps);
+
+	MemoryContextDelete(node->wincontext);
+
+	/* the child node is shut down last */
+	ExecEndNode(outerPlanState(node));
+}
+
+/* -----------------
+ * ExecReScanWindowAgg
+ *
+ *	Reset the node so that the next fetch starts over: clear the
+ *	completion flags, release the current partition, empty all working
+ *	slots, zero the stored window function results, and rescan the outer
+ *	subplan unless it has pending parameter changes.
+ * -----------------
+ */
+void
+ExecReScanWindowAgg(WindowAggState *node, ExprContext *exprCtxt)
+{
+	PlanState  *subplan = outerPlanState(node);
+	ExprContext *outcontext = node->ss.ps.ps_ExprContext;
+
+	node->all_done = false;
+	node->ss.ps.ps_TupFromTlist = false;
+
+	/* release tuplestore et al */
+	release_partition(node);
+
+	/* release all temp tuples, but especially first_part_slot */
+	ExecClearTuple(node->ss.ss_ScanTupleSlot);
+	ExecClearTuple(node->first_part_slot);
+	ExecClearTuple(node->first_peer_slot);
+	ExecClearTuple(node->temp_slot_1);
+	ExecClearTuple(node->temp_slot_2);
+
+	/* Forget current wfunc values */
+	MemSet(outcontext->ecxt_aggvalues, 0, sizeof(Datum) * node->numfuncs);
+	MemSet(outcontext->ecxt_aggnulls, 0, sizeof(bool) * node->numfuncs);
+
+	/*
+	 * if chgParam of subnode is not null then plan will be re-scanned by
+	 * first ExecProcNode.
+	 */
+	if (subplan->chgParam == NULL)
+		ExecReScan(subplan, exprCtxt);
+}
+
+/*
+ * initialize_peragg
+ *
+ * Almost same as in nodeAgg.c, except we don't support DISTINCT currently.
+ *
+ * Looks up the pg_aggregate entry for wfunc->winfnoid and fills in
+ * *peraggstate: transition and final function OIDs and lookup data,
+ * length/by-value info for the result and transition types, and the
+ * aggregate's initial value.
+ *
+ * winstate: the owning node state (not referenced in this function's body)
+ * wfunc: the WindowFunc node that calls a plain aggregate
+ * peraggstate: the per-aggregate record to fill in
+ *
+ * Returns peraggstate.  Raises an error if a catalog lookup fails, if the
+ * aggregate's owner may not execute the transition or final function, or
+ * if a strict transition function with a null initial value has an input
+ * type that is not binary-coercible to the transition type.
+ */
+static WindowStatePerAggData *
+initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc,
+				  WindowStatePerAgg peraggstate)
+{
+	Oid			inputTypes[FUNC_MAX_ARGS];
+	int			numArguments;
+	HeapTuple	aggTuple;
+	Form_pg_aggregate aggform;
+	Oid			aggtranstype;
+	AclResult	aclresult;
+	Oid			transfn_oid,
+				finalfn_oid;
+	Expr	   *transfnexpr,
+			   *finalfnexpr;
+	Datum		textInitVal;
+	int			i;
+	ListCell   *lc;
+
+	numArguments = list_length(wfunc->args);
+
+	/* collect the actual type of each argument expression */
+	i = 0;
+	foreach(lc, wfunc->args)
+	{
+		inputTypes[i++] = exprType((Node *) lfirst(lc));
+	}
+
+	/* aggTuple stays pinned until the ReleaseSysCache at the end */
+	aggTuple = SearchSysCache(AGGFNOID,
+							  ObjectIdGetDatum(wfunc->winfnoid),
+							  0, 0, 0);
+	if (!HeapTupleIsValid(aggTuple))
+		elog(ERROR, "cache lookup failed for aggregate %u",
+			 wfunc->winfnoid);
+	aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+
+	/*
+	 * ExecInitWindowAgg already checked permission to call aggregate function
+	 * ... but we still need to check the component functions
+	 */
+
+	peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
+	peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
+
+	/* Check that aggregate owner has permission to call component fns */
+	{
+		HeapTuple	procTuple;
+		Oid			aggOwner;
+
+		/* the owner is read from the aggregate's own pg_proc row */
+		procTuple = SearchSysCache(PROCOID,
+								   ObjectIdGetDatum(wfunc->winfnoid),
+								   0, 0, 0);
+		if (!HeapTupleIsValid(procTuple))
+			elog(ERROR, "cache lookup failed for function %u",
+				 wfunc->winfnoid);
+		aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner;
+		ReleaseSysCache(procTuple);
+
+		aclresult = pg_proc_aclcheck(transfn_oid, aggOwner,
+									 ACL_EXECUTE);
+		if (aclresult != ACLCHECK_OK)
+			aclcheck_error(aclresult, ACL_KIND_PROC,
+						   get_func_name(transfn_oid));
+		/* the final function is optional; check it only when present */
+		if (OidIsValid(finalfn_oid))
+		{
+			aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner,
+										 ACL_EXECUTE);
+			if (aclresult != ACLCHECK_OK)
+				aclcheck_error(aclresult, ACL_KIND_PROC,
+							   get_func_name(finalfn_oid));
+		}
+	}
+
+	/* resolve actual type of transition state, if polymorphic */
+	aggtranstype = aggform->aggtranstype;
+	if (IsPolymorphicType(aggtranstype))
+	{
+		/* have to fetch the agg's declared input types... */
+		Oid		   *declaredArgTypes;
+		int			agg_nargs;
+
+		get_func_signature(wfunc->winfnoid,
+						   &declaredArgTypes, &agg_nargs);
+		Assert(agg_nargs == numArguments);
+		aggtranstype = enforce_generic_type_consistency(inputTypes,
+														declaredArgTypes,
+														agg_nargs,
+														aggtranstype,
+														false);
+		pfree(declaredArgTypes);
+	}
+
+	/* build expression trees using actual argument & result types */
+	build_aggregate_fnexprs(inputTypes,
+							numArguments,
+							aggtranstype,
+							wfunc->wintype,
+							transfn_oid,
+							finalfn_oid,
+							&transfnexpr,
+							&finalfnexpr);
+
+	/* attach the expression trees to the function lookup data as fn_expr */
+	fmgr_info(transfn_oid, &peraggstate->transfn);
+	peraggstate->transfn.fn_expr = (Node *) transfnexpr;
+
+	if (OidIsValid(finalfn_oid))
+	{
+		fmgr_info(finalfn_oid, &peraggstate->finalfn);
+		peraggstate->finalfn.fn_expr = (Node *) finalfnexpr;
+	}
+
+	get_typlenbyval(wfunc->wintype,
+					&peraggstate->resulttypeLen,
+					&peraggstate->resulttypeByVal);
+	get_typlenbyval(aggtranstype,
+					&peraggstate->transtypeLen,
+					&peraggstate->transtypeByVal);
+
+	/*
+	 * initval is potentially null, so don't try to access it as a struct
+	 * field. Must do it the hard way with SysCacheGetAttr.
+	 */
+	textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
+								  Anum_pg_aggregate_agginitval,
+								  &peraggstate->initValueIsNull);
+
+	/* a non-null initval is stored as text and converted to the transtype */
+	if (peraggstate->initValueIsNull)
+		peraggstate->initValue = (Datum) 0;
+	else
+		peraggstate->initValue = GetAggInitVal(textInitVal,
+											   aggtranstype);
+
+	/*
+	 * If the transfn is strict and the initval is NULL, make sure input
+	 * type and transtype are the same (or at least binary-compatible), so
+	 * that it's OK to use the first input value as the initial
+	 * transValue.  This should have been checked at agg definition time,
+	 * but just in case...
+	 */
+	if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull)
+	{
+		if (numArguments < 1 ||
+			!IsBinaryCoercible(inputTypes[0], aggtranstype))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+					 errmsg("aggregate %u needs to have compatible input type and transition type",
+							wfunc->winfnoid)));
+	}
+
+	ReleaseSysCache(aggTuple);
+
+	return peraggstate;
+}
+
+/*
+ * GetAggInitVal
+ *	Convert an aggregate's initial value from its text form into a Datum
+ *	of the transition type, by running the text through that type's input
+ *	function.
+ *
+ * textInitVal: the initial value as a text datum (must not be null)
+ * transtype: OID of the transition type
+ */
+static Datum
+GetAggInitVal(Datum textInitVal, Oid transtype)
+{
+	Oid			inputfn;
+	Oid			ioparam;
+	char	   *initstr;
+	Datum		converted;
+
+	getTypeInputInfo(transtype, &inputfn, &ioparam);
+
+	/* the C-string copy is only needed for the duration of the call */
+	initstr = TextDatumGetCString(textInitVal);
+	converted = OidInputFunctionCall(inputfn, initstr, ioparam, -1);
+	pfree(initstr);
+
+	return converted;
+}
+
+/*
+ * are_peers
+ *	Report whether two rows are equal on the node's ordering columns.
+ *
+ * With no ordering columns every pair of rows counts as peers, and the
+ * slots are not examined at all.
+ */
+static bool
+are_peers(WindowAggState *winstate, TupleTableSlot *slot1,
+		  TupleTableSlot *slot2)
+{
+	WindowAgg  *plan = (WindowAgg *) winstate->ss.ps.plan;
+
+	if (plan->ordNumCols != 0)
+	{
+		/* comparison scratch space comes from tmpcontext's per-tuple memory */
+		MemoryContext cmpcontext = winstate->tmpcontext->ecxt_per_tuple_memory;
+
+		return execTuplesMatch(slot1, slot2,
+							   plan->ordNumCols, plan->ordColIdx,
+							   winstate->ordEqfunctions,
+							   cmpcontext);
+	}
+
+	/* If no ORDER BY, all rows are peers with each other */
+	return true;
+}
+
+/*
+ * window_gettupleslot
+ *	Fetch the pos'th tuple of the current partition into the slot
+ *
+ * winobj: the calling window function's WindowObject; its read pointer and
+ *		seekpos are used and updated
+ * pos: row number within the partition, counting from 0
+ * slot: slot that receives the tuple
+ *
+ * Returns true if successful, false if no such row (pos is negative, or at
+ * or beyond the number of rows in the partition).  Raises an error if pos
+ * lies before the WindowObject's mark position.
+ */
+static bool
+window_gettupleslot(WindowObject winobj, int64 pos, TupleTableSlot *slot)
+{
+	WindowAggState *winstate = winobj->winstate;
+	MemoryContext oldcontext;
+
+	/* Don't allow passing -1 to spool_tuples here */
+	if (pos < 0)
+		return false;
+
+	/* If necessary, fetch the tuple into the spool */
+	spool_tuples(winstate, pos);
+
+	/* still not spooled after trying: the partition has no such row */
+	if (pos >= winstate->spooled_rows)
+		return false;
+
+	if (pos < winobj->markpos)
+		elog(ERROR, "cannot fetch row before WindowObject's mark position");
+
+	/* the tuplestore calls below run in the per-query memory context */
+	oldcontext = MemoryContextSwitchTo(winstate->ss.ps.ps_ExprContext->ecxt_per_query_memory);
+
+	tuplestore_select_read_pointer(winstate->buffer, winobj->readptr);
+
+	/*
+	 * There's no API to refetch the tuple at the current position. We
+	 * have to move one tuple forward, and then one backward.  (We don't
+	 * do it the other way because we might try to fetch the row before
+	 * our mark, which isn't allowed.)
+	 */
+	if (winobj->seekpos == pos)
+	{
+		tuplestore_advance(winstate->buffer, true);
+		winobj->seekpos++;
+	}
+
+	/*
+	 * Step backward or forward one tuple at a time until seekpos equals
+	 * pos.  At most one of the two loops does any work, and the last fetch
+	 * performed is the one that leaves the requested tuple in the slot.
+	 */
+	while (winobj->seekpos > pos)
+	{
+		if (!tuplestore_gettupleslot(winstate->buffer, false, slot))
+			elog(ERROR, "unexpected end of tuplestore");
+		winobj->seekpos--;
+	}
+
+	while (winobj->seekpos < pos)
+	{
+		if (!tuplestore_gettupleslot(winstate->buffer, true, slot))
+			elog(ERROR, "unexpected end of tuplestore");
+		winobj->seekpos++;
+	}
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return true;
+}
+
+
+/***********************************************************************
+ * API exposed to window functions
+ ***********************************************************************/
+
+
+/*
+ * WinGetPartitionLocalMemory
+ *		Get working memory that lives till end of partition processing
+ *
+ * The first call within a partition allocates a zero-filled chunk of sz
+ * bytes; later calls hand back that same chunk and ignore sz.
+ *
+ * Use this for state that should start afresh in every partition.  State
+ * that must survive for the whole query can be kept in fcinfo->fn_extra
+ * in the usual way instead.
+ */
+void *
+WinGetPartitionLocalMemory(WindowObject winobj, Size sz)
+{
+	void	   *chunk;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	chunk = winobj->localmem;
+	if (chunk == NULL)
+	{
+		/* nothing allocated yet: take it from the node's long-lived context */
+		chunk = MemoryContextAllocZero(winobj->winstate->wincontext, sz);
+		winobj->localmem = chunk;
+	}
+
+	return chunk;
+}
+
+/*
+ * WinGetCurrentPosition
+ *		Report where the current row sits inside the current partition,
+ *		as a zero-based row number.
+ */
+int64
+WinGetCurrentPosition(WindowObject winobj)
+{
+	WindowAggState *winstate;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+	return winstate->currentpos;
+}
+
+/*
+ * WinGetPartitionRowCount
+ *		Return total number of rows contained in the current partition.
+ *
+ * Note: the first call in a partition is relatively expensive, since the
+ * entire partition has to be "spooled" into the tuplestore before the
+ * count is known.  Further calls within the same partition are cheap.
+ */
+int64
+WinGetPartitionRowCount(WindowObject winobj)
+{
+	WindowAggState *winstate;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+
+	/* -1 asks for the whole partition to be spooled */
+	spool_tuples(winstate, -1);
+
+	return winstate->spooled_rows;
+}
+
+/*
+ * WinSetMarkPosition
+ *		Set the "mark" position for the window object, which is the oldest row
+ *		number (counting from 0) it is allowed to fetch during all subsequent
+ *		operations within the current partition.
+ *
+ * Calling this is optional, but window functions are encouraged to advance
+ * the mark whenever they can: it keeps the tuplestore small and avoids
+ * spilling rows to disk.  Moving the mark backward raises an error.
+ */
+void
+WinSetMarkPosition(WindowObject winobj, int64 markpos)
+{
+	WindowAggState *winstate;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	if (markpos < winobj->markpos)
+		elog(ERROR, "cannot move WindowObject's mark position backward");
+
+	winstate = winobj->winstate;
+
+	/* walk the mark pointer forward until it sits at markpos */
+	tuplestore_select_read_pointer(winstate->buffer, winobj->markptr);
+	for (; winobj->markpos < markpos; winobj->markpos++)
+		tuplestore_advance(winstate->buffer, true);
+
+	/* the read pointer must not be left behind the mark, so drag it along */
+	tuplestore_select_read_pointer(winstate->buffer, winobj->readptr);
+	for (; winobj->seekpos < markpos; winobj->seekpos++)
+		tuplestore_advance(winstate->buffer, true);
+}
+
+/*
+ * WinRowsArePeers
+ *		Report whether the rows at two absolute partition positions are
+ *		equal according to the ORDER BY clause.
+ *
+ * Without ordering columns the answer is always true and neither row is
+ * fetched.  Otherwise both rows must be fetchable; a position that cannot
+ * be fetched raises an error.
+ */
+bool
+WinRowsArePeers(WindowObject winobj, int64 pos1, int64 pos2)
+{
+	WindowAggState *winstate;
+	TupleTableSlot *first;
+	TupleTableSlot *second;
+	bool		peers;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+
+	/* If no ORDER BY, all rows are peers; don't bother to fetch them */
+	if (((WindowAgg *) winstate->ss.ps.plan)->ordNumCols == 0)
+		return true;
+
+	/* load the two rows into the node's two temp slots, pos1 first */
+	first = winstate->temp_slot_1;
+	second = winstate->temp_slot_2;
+
+	if (!window_gettupleslot(winobj, pos1, first))
+		elog(ERROR, "specified position is out of window: " INT64_FORMAT,
+			 pos1);
+	if (!window_gettupleslot(winobj, pos2, second))
+		elog(ERROR, "specified position is out of window: " INT64_FORMAT,
+			 pos2);
+
+	peers = are_peers(winstate, first, second);
+
+	/* don't leave the fetched rows sitting in the temp slots */
+	ExecClearTuple(first);
+	ExecClearTuple(second);
+
+	return peers;
+}
+
+/*
+ * WinGetFuncArgInPartition
+ *		Evaluate one of a window function's argument expressions against a
+ *		chosen row of the partition.  The row is addressed lseek(2)-style:
+ *		an offset from the current, first, or last row.
+ *
+ * argno: which argument to evaluate (counted from 0)
+ * relpos: signed row offset from the seek position
+ * seektype: WINDOW_SEEK_CURRENT, WINDOW_SEEK_HEAD, or WINDOW_SEEK_TAIL
+ * set_mark: if true and the row exists, the mark is moved to that row as a
+ *		side-effect
+ * isnull: output argument, receives isnull status of result
+ * isout: output argument, set to tell whether the target row lies outside
+ *		the partition (pass NULL if not interested)
+ *
+ * Asking for a row that does not exist is not an error: the result is
+ * null, and *isout is set true when isout isn't NULL.
+ */
+Datum
+WinGetFuncArgInPartition(WindowObject winobj, int argno,
+						 int relpos, int seektype, bool set_mark,
+						 bool *isnull, bool *isout)
+{
+	WindowAggState *winstate;
+	ExprContext *econtext;
+	TupleTableSlot *slot;
+	ExprState  *argstate;
+	int64		abs_pos;
+	bool		found;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+	econtext = winstate->ss.ps.ps_ExprContext;
+	slot = winstate->temp_slot_1;
+
+	/* turn (seektype, relpos) into an absolute row number */
+	if (seektype == WINDOW_SEEK_CURRENT)
+		abs_pos = winstate->currentpos + relpos;
+	else if (seektype == WINDOW_SEEK_HEAD)
+		abs_pos = relpos;
+	else if (seektype == WINDOW_SEEK_TAIL)
+	{
+		/* the last row is only known once the whole partition is spooled */
+		spool_tuples(winstate, -1);
+		abs_pos = winstate->spooled_rows - 1 + relpos;
+	}
+	else
+	{
+		elog(ERROR, "unrecognized window seek type: %d", seektype);
+		abs_pos = 0; /* keep compiler quiet */
+	}
+
+	/* a negative position can never exist, so don't even try to fetch it */
+	found = (abs_pos >= 0) && window_gettupleslot(winobj, abs_pos, slot);
+
+	if (isout)
+		*isout = !found;
+
+	if (!found)
+	{
+		*isnull = true;
+		return (Datum) 0;
+	}
+
+	if (set_mark)
+		WinSetMarkPosition(winobj, abs_pos);
+
+	/* make Vars in the argument expression refer to the fetched row */
+	econtext->ecxt_outertuple = slot;
+	argstate = (ExprState *) list_nth(winobj->argstates, argno);
+	return ExecEvalExpr(argstate, econtext, isnull, NULL);
+}
+
+/*
+ * WinGetFuncArgInFrame
+ *		Evaluate a window function's argument expression on a specified
+ *		row of the window frame.  The row is identified in lseek(2) style,
+ *		i.e. relative to the current, first, or last row.
+ *
+ * argno: argument number to evaluate (counted from 0)
+ * relpos: signed rowcount offset from the seek position
+ * seektype: WINDOW_SEEK_CURRENT, WINDOW_SEEK_HEAD, or WINDOW_SEEK_TAIL
+ * set_mark: If the row is found and set_mark is true, the mark is moved to
+ *		the row as a side-effect.
+ * isnull: output argument, receives isnull status of result
+ * isout: output argument, set to indicate whether target row position
+ *		is out of frame (can pass NULL if caller doesn't care about this)
+ *
+ * Specifying a nonexistent row is not an error, it just causes a null result
+ * (plus setting *isout true, if isout isn't NULL).
+ *
+ * When there are ordering columns, the frame runs from the first row of the
+ * partition through the last peer of the current row.  With no ordering
+ * columns the frame is the whole partition, and the call is simply handed
+ * to WinGetFuncArgInPartition.
+ */
+Datum
+WinGetFuncArgInFrame(WindowObject winobj, int argno,
+					 int relpos, int seektype, bool set_mark,
+					 bool *isnull, bool *isout)
+{
+	WindowAggState *winstate;
+	ExprContext *econtext;
+	TupleTableSlot *slot;
+	bool		gottuple;
+	int64		abs_pos;
+	int64		frametailpos;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+
+	/* if no ordering columns, partition and frame are the same thing */
+	if (((WindowAgg *) winstate->ss.ps.plan)->ordNumCols == 0)
+		return WinGetFuncArgInPartition(winobj, argno, relpos, seektype,
+										set_mark, isnull, isout);
+
+	econtext = winstate->ss.ps.ps_ExprContext;
+	slot = winstate->temp_slot_1;
+	frametailpos = winstate->frametailpos;
+
+	switch (seektype)
+	{
+		case WINDOW_SEEK_CURRENT:
+			abs_pos = winstate->currentpos + relpos;
+			break;
+		case WINDOW_SEEK_HEAD:
+			abs_pos = relpos;
+			break;
+		case WINDOW_SEEK_TAIL:
+			/* abs_pos is calculated later, once the frame tail is known */
+			abs_pos = 0; /* keep compiler quiet */
+			break;
+		default:
+			elog(ERROR, "unrecognized window seek type: %d", seektype);
+			abs_pos = 0; /* keep compiler quiet */
+			break;
+	}
+
+	/*
+	 * Locate the frame tail, i.e. the last peer of the current row.  A
+	 * remembered tail that lies beyond the current row is still valid and
+	 * is reused; otherwise scan forward from the current row.
+	 *
+	 * The scan always continues to the first row that is not a peer (or to
+	 * the end of the partition), whatever the seek type.  Stopping earlier
+	 * would record a tail that is too small, and a target row that is a
+	 * peer of the current row but lies further ahead would then be wrongly
+	 * reported as out of frame.
+	 */
+	if (frametailpos <= winstate->currentpos)
+	{
+		int64 add = 1;
+
+		for (;;)
+		{
+			spool_tuples(winstate, winstate->currentpos + add);
+
+			/* If hit the partition end, the last row is the frame tail */
+			if (winstate->spooled_rows <= winstate->currentpos + add)
+				break;
+
+			/* The first non-peer row ends the frame */
+			if (!WinRowsArePeers(winobj, winstate->currentpos,
+								 winstate->currentpos + add))
+				break;
+
+			add++;
+		}
+		frametailpos = winstate->currentpos + add - 1;
+		winstate->frametailpos = frametailpos;
+	}
+
+	if (seektype == WINDOW_SEEK_TAIL)
+	{
+		abs_pos = frametailpos + relpos;
+	}
+
+	/*
+	 * Only rows from the start of the partition through the frame tail are
+	 * inside the frame (we don't support other window frame specifications
+	 * yet); anything else is reported as not found.
+	 */
+	if (abs_pos < 0 || abs_pos > frametailpos)
+		gottuple = false;
+	else
+		gottuple = window_gettupleslot(winobj, abs_pos, slot);
+
+	if (!gottuple)
+	{
+		if (isout)
+			*isout = true;
+		*isnull = true;
+		return (Datum) 0;
+	}
+	else
+	{
+		if (isout)
+			*isout = false;
+		if (set_mark)
+			WinSetMarkPosition(winobj, abs_pos);
+		/* make Vars in the argument expression refer to the fetched row */
+		econtext->ecxt_outertuple = slot;
+		return ExecEvalExpr((ExprState *) list_nth(winobj->argstates, argno),
+							econtext, isnull, NULL);
+	}
+}
+
+/*
+ * WinGetFuncArgCurrent
+ *		Evaluate one of a window function's argument expressions against
+ *		the current row.
+ *
+ * argno: which argument to evaluate (counted from 0)
+ * isnull: output argument, receives isnull status of result
+ *
+ * Note: this is not the same as asking WinGetFuncArgInPartition or
+ * WinGetFuncArgInFrame for the current row, because it works even when
+ * the WindowObject's mark has already been moved past the current row.
+ * It is the natural choice for a window function's "ordinary" arguments,
+ * such as the offset argument of lead() or lag().
+ */
+Datum
+WinGetFuncArgCurrent(WindowObject winobj, int argno, bool *isnull)
+{
+	WindowAggState *winstate;
+	ExprContext *econtext;
+	ExprState  *argstate;
+
+	Assert(WindowObjectIsValid(winobj));
+
+	winstate = winobj->winstate;
+	econtext = winstate->ss.ps.ps_ExprContext;
+
+	/* the current row is read from the scan slot, not from the tuplestore */
+	econtext->ecxt_outertuple = winstate->ss.ss_ScanTupleSlot;
+
+	argstate = (ExprState *) list_nth(winobj->argstates, argno);
+	return ExecEvalExpr(argstate, econtext, isnull, NULL);
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 86f555a03a6..412fd96e5bf 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -15,7 +15,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.416 2008/12/19 16:25:17 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.417 2008/12/28 18:53:55 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -669,6 +669,32 @@ _copyAgg(Agg *from)
 }
 
 /*
+ * _copyWindowAgg - build a new WindowAgg plan node from the fields of 'from'
+ */
+static WindowAgg *
+_copyWindowAgg(WindowAgg *from)
+{
+	WindowAgg  *newnode = makeNode(WindowAgg);
+
+	CopyPlanFields((Plan *) from, (Plan *) newnode);
+
+	COPY_SCALAR_FIELD(partNumCols);
+	if (from->partNumCols > 0)	/* the two arrays are copied only when non-empty */
+	{
+		COPY_POINTER_FIELD(partColIdx, from->partNumCols * sizeof(AttrNumber));
+		COPY_POINTER_FIELD(partOperators, from->partNumCols * sizeof(Oid));
+	}
+	COPY_SCALAR_FIELD(ordNumCols);
+	if (from->ordNumCols > 0)	/* likewise for the ordering-column arrays */
+	{
+		COPY_POINTER_FIELD(ordColIdx, from->ordNumCols * sizeof(AttrNumber));
+		COPY_POINTER_FIELD(ordOperators, from->ordNumCols * sizeof(Oid));
+	}
+
+	return newnode;
+}
+
+/*
  * _copyUnique
  */
 static Unique *
@@ -932,6 +958,25 @@ _copyAggref(Aggref *from)
 }
 
 /*
+ * _copyWindowFunc - build a new WindowFunc node from the fields of 'from'
+ */
+static WindowFunc *
+_copyWindowFunc(WindowFunc *from)
+{
+	WindowFunc *newnode = makeNode(WindowFunc);
+
+	COPY_SCALAR_FIELD(winfnoid);
+	COPY_SCALAR_FIELD(wintype);
+	COPY_NODE_FIELD(args);	/* the only node-valued field copied here */
+	COPY_SCALAR_FIELD(winref);
+	COPY_SCALAR_FIELD(winstar);
+	COPY_SCALAR_FIELD(winagg);
+	COPY_LOCATION_FIELD(location);
+
+	return newnode;
+}
+
+/*
  * _copyArrayRef
  */
 static ArrayRef *
@@ -1729,6 +1774,21 @@ _copySortGroupClause(SortGroupClause *from)
 	return newnode;
 }
 
+static WindowClause *
+_copyWindowClause(WindowClause *from)	/* build a new WindowClause from 'from' */
+{
+	WindowClause *newnode = makeNode(WindowClause);
+
+	COPY_STRING_FIELD(name);
+	COPY_STRING_FIELD(refname);
+	COPY_NODE_FIELD(partitionClause);
+	COPY_NODE_FIELD(orderClause);
+	COPY_SCALAR_FIELD(winref);
+	COPY_SCALAR_FIELD(copiedOrder);
+
+	return newnode;
+}
+
 static RowMarkClause *
 _copyRowMarkClause(RowMarkClause *from)
 {
@@ -1850,6 +1910,7 @@ _copyFuncCall(FuncCall *from)
 	COPY_SCALAR_FIELD(agg_star);
 	COPY_SCALAR_FIELD(agg_distinct);
 	COPY_SCALAR_FIELD(func_variadic);
+	COPY_NODE_FIELD(over);
 	COPY_LOCATION_FIELD(location);
 
 	return newnode;
@@ -1940,6 +2001,20 @@ _copySortBy(SortBy *from)
 	return newnode;
 }
 
+static WindowDef *
+_copyWindowDef(WindowDef *from)	/* build a new WindowDef from 'from' */
+{
+	WindowDef  *newnode = makeNode(WindowDef);
+
+	COPY_STRING_FIELD(name);
+	COPY_STRING_FIELD(refname);
+	COPY_NODE_FIELD(partitionClause);
+	COPY_NODE_FIELD(orderClause);
+	COPY_LOCATION_FIELD(location);
+
+	return newnode;
+}
+
 static RangeSubselect *
 _copyRangeSubselect(RangeSubselect *from)
 {
@@ -2081,6 +2156,7 @@ _copyQuery(Query *from)
 	COPY_SCALAR_FIELD(resultRelation);
 	COPY_NODE_FIELD(intoClause);
 	COPY_SCALAR_FIELD(hasAggs);
+	COPY_SCALAR_FIELD(hasWindowFuncs);
 	COPY_SCALAR_FIELD(hasSubLinks);
 	COPY_SCALAR_FIELD(hasDistinctOn);
 	COPY_SCALAR_FIELD(hasRecursive);
@@ -2091,6 +2167,7 @@ _copyQuery(Query *from)
 	COPY_NODE_FIELD(returningList);
 	COPY_NODE_FIELD(groupClause);
 	COPY_NODE_FIELD(havingQual);
+	COPY_NODE_FIELD(windowClause);
 	COPY_NODE_FIELD(distinctClause);
 	COPY_NODE_FIELD(sortClause);
 	COPY_NODE_FIELD(limitOffset);
@@ -2153,6 +2230,7 @@ _copySelectStmt(SelectStmt *from)
 	COPY_NODE_FIELD(whereClause);
 	COPY_NODE_FIELD(groupClause);
 	COPY_NODE_FIELD(havingClause);
+	COPY_NODE_FIELD(windowClause);
 	COPY_NODE_FIELD(withClause);
 	COPY_NODE_FIELD(valuesLists);
 	COPY_NODE_FIELD(sortClause);
@@ -3440,6 +3518,9 @@ copyObject(void *from)
 		case T_Agg:
 			retval = _copyAgg(from);
 			break;
+		case T_WindowAgg:
+			retval = _copyWindowAgg(from);
+			break;
 		case T_Unique:
 			retval = _copyUnique(from);
 			break;
@@ -3480,6 +3561,9 @@ copyObject(void *from)
 		case T_Aggref:
 			retval = _copyAggref(from);
 			break;
+		case T_WindowFunc:
+			retval = _copyWindowFunc(from);
+			break;
 		case T_ArrayRef:
 			retval = _copyArrayRef(from);
 			break;
@@ -3951,6 +4035,9 @@ copyObject(void *from)
 		case T_SortBy:
 			retval = _copySortBy(from);
 			break;
+		case T_WindowDef:
+			retval = _copyWindowDef(from);
+			break;
 		case T_RangeSubselect:
 			retval = _copyRangeSubselect(from);
 			break;
@@ -3984,6 +4071,9 @@ copyObject(void *from)
 		case T_SortGroupClause:
 			retval = _copySortGroupClause(from);
 			break;
+		case T_WindowClause:
+			retval = _copyWindowClause(from);
+			break;
 		case T_RowMarkClause:
 			retval = _copyRowMarkClause(from);
 			break;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index e5e2bc44226..e96c66152e8 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -22,7 +22,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.341 2008/12/19 16:25:17 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.342 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -192,6 +192,20 @@ _equalAggref(Aggref *a, Aggref *b)
 }
 
 static bool
+_equalWindowFunc(WindowFunc *a, WindowFunc *b)
+{
+	COMPARE_SCALAR_FIELD(winfnoid);
+	COMPARE_SCALAR_FIELD(wintype);
+	COMPARE_NODE_FIELD(args);
+	COMPARE_SCALAR_FIELD(winref);	/* winref takes part in the comparison too */
+	COMPARE_SCALAR_FIELD(winstar);
+	COMPARE_SCALAR_FIELD(winagg);
+	COMPARE_LOCATION_FIELD(location);
+
+	return true;	/* NOTE(review): presumably each COMPARE_* macro returns false on a mismatch -- confirm */
+}
+
+static bool
 _equalArrayRef(ArrayRef *a, ArrayRef *b)
 {
 	COMPARE_SCALAR_FIELD(refarraytype);
@@ -839,6 +853,7 @@ _equalQuery(Query *a, Query *b)
 	COMPARE_SCALAR_FIELD(resultRelation);
 	COMPARE_NODE_FIELD(intoClause);
 	COMPARE_SCALAR_FIELD(hasAggs);
+	COMPARE_SCALAR_FIELD(hasWindowFuncs);
 	COMPARE_SCALAR_FIELD(hasSubLinks);
 	COMPARE_SCALAR_FIELD(hasDistinctOn);
 	COMPARE_SCALAR_FIELD(hasRecursive);
@@ -849,6 +864,7 @@ _equalQuery(Query *a, Query *b)
 	COMPARE_NODE_FIELD(returningList);
 	COMPARE_NODE_FIELD(groupClause);
 	COMPARE_NODE_FIELD(havingQual);
+	COMPARE_NODE_FIELD(windowClause);
 	COMPARE_NODE_FIELD(distinctClause);
 	COMPARE_NODE_FIELD(sortClause);
 	COMPARE_NODE_FIELD(limitOffset);
@@ -903,6 +919,7 @@ _equalSelectStmt(SelectStmt *a, SelectStmt *b)
 	COMPARE_NODE_FIELD(whereClause);
 	COMPARE_NODE_FIELD(groupClause);
 	COMPARE_NODE_FIELD(havingClause);
+	COMPARE_NODE_FIELD(windowClause);
 	COMPARE_NODE_FIELD(withClause);
 	COMPARE_NODE_FIELD(valuesLists);
 	COMPARE_NODE_FIELD(sortClause);
@@ -1894,6 +1911,7 @@ _equalFuncCall(FuncCall *a, FuncCall *b)
 	COMPARE_SCALAR_FIELD(agg_star);
 	COMPARE_SCALAR_FIELD(agg_distinct);
 	COMPARE_SCALAR_FIELD(func_variadic);
+	COMPARE_NODE_FIELD(over);
 	COMPARE_LOCATION_FIELD(location);
 
 	return true;
@@ -1981,6 +1999,18 @@ _equalSortBy(SortBy *a, SortBy *b)
 }
 
 static bool
+_equalWindowDef(WindowDef *a, WindowDef *b)
+{								/* field-by-field comparison of two WindowDef nodes */
+	COMPARE_STRING_FIELD(name);
+	COMPARE_STRING_FIELD(refname);
+	COMPARE_NODE_FIELD(partitionClause);
+	COMPARE_NODE_FIELD(orderClause);
+	COMPARE_LOCATION_FIELD(location);
+	/* NOTE(review): COMPARE_* are macros defined outside this hunk; presumably each returns false on a mismatch */
+	return true;				/* same five fields that _outWindowDef writes */
+}
+
+static bool
 _equalRangeSubselect(RangeSubselect *a, RangeSubselect *b)
 {
 	COMPARE_NODE_FIELD(subquery);
@@ -2107,6 +2137,19 @@ _equalSortGroupClause(SortGroupClause *a, SortGroupClause *b)
 }
 
 static bool
+_equalWindowClause(WindowClause *a, WindowClause *b)
+{								/* field-by-field comparison of two WindowClause nodes */
+	COMPARE_STRING_FIELD(name);
+	COMPARE_STRING_FIELD(refname);
+	COMPARE_NODE_FIELD(partitionClause);
+	COMPARE_NODE_FIELD(orderClause);
+	COMPARE_SCALAR_FIELD(winref);
+	COMPARE_SCALAR_FIELD(copiedOrder);
+	/* NOTE(review): COMPARE_* are macros defined outside this hunk; presumably each returns false on a mismatch */
+	return true;				/* same six fields that _outWindowClause and _readWindowClause handle */
+}
+
+static bool
 _equalRowMarkClause(RowMarkClause *a, RowMarkClause *b)
 {
 	COMPARE_SCALAR_FIELD(rti);
@@ -2311,6 +2354,9 @@ equal(void *a, void *b)
 		case T_Aggref:
 			retval = _equalAggref(a, b);
 			break;
+		case T_WindowFunc:
+			retval = _equalWindowFunc(a, b);
+			break;
 		case T_ArrayRef:
 			retval = _equalArrayRef(a, b);
 			break;
@@ -2769,6 +2815,9 @@ equal(void *a, void *b)
 		case T_SortBy:
 			retval = _equalSortBy(a, b);
 			break;
+		case T_WindowDef:
+			retval = _equalWindowDef(a, b);
+			break;
 		case T_RangeSubselect:
 			retval = _equalRangeSubselect(a, b);
 			break;
@@ -2802,6 +2851,9 @@ equal(void *a, void *b)
 		case T_SortGroupClause:
 			retval = _equalSortGroupClause(a, b);
 			break;
+		case T_WindowClause:
+			retval = _equalWindowClause(a, b);
+			break;
 		case T_RowMarkClause:
 			retval = _equalRowMarkClause(a, b);
 			break;
diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c
index 7236360347c..0284ce4edca 100644
--- a/src/backend/nodes/nodeFuncs.c
+++ b/src/backend/nodes/nodeFuncs.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/nodeFuncs.c,v 1.35 2008/10/21 20:42:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/nodeFuncs.c,v 1.36 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,6 +52,9 @@ exprType(Node *expr)
 		case T_Aggref:
 			type = ((Aggref *) expr)->aggtype;
 			break;
+		case T_WindowFunc:
+			type = ((WindowFunc *) expr)->wintype;
+			break;
 		case T_ArrayRef:
 			{
 				ArrayRef   *arrayref = (ArrayRef *) expr;
@@ -548,6 +551,8 @@ expression_returns_set_walker(Node *node, void *context)
 	/* Avoid recursion for some cases that can't return a set */
 	if (IsA(node, Aggref))
 		return false;
+	if (IsA(node, WindowFunc))
+		return false;
 	if (IsA(node, DistinctExpr))
 		return false;
 	if (IsA(node, ScalarArrayOpExpr))
@@ -634,6 +639,10 @@ exprLocation(Node *expr)
 			/* function name should always be the first thing */
 			loc = ((Aggref *) expr)->location;
 			break;
+		case T_WindowFunc:
+			/* function name should always be the first thing */
+			loc = ((WindowFunc *) expr)->location;
+			break;
 		case T_ArrayRef:
 			/* just use array argument's location */
 			loc = exprLocation((Node *) ((ArrayRef *) expr)->refexpr);
@@ -868,6 +877,9 @@ exprLocation(Node *expr)
 			/* just use argument's location (ignore operator, if any) */
 			loc = exprLocation(((SortBy *) expr)->node);
 			break;
+		case T_WindowDef:
+			loc = ((WindowDef *) expr)->location;
+			break;
 		case T_TypeName:
 			loc = ((TypeName *) expr)->location;
 			break;
@@ -1045,6 +1057,16 @@ expression_tree_walker(Node *node,
 					return true;
 			}
 			break;
+		case T_WindowFunc:
+			{
+				WindowFunc *expr = (WindowFunc *) node;
+
+				/* recurse directly on List */
+				if (expression_tree_walker((Node *) expr->args,
+										   walker, context))
+					return true;
+			}
+			break;
 		case T_ArrayRef:
 			{
 				ArrayRef   *aref = (ArrayRef *) node;
@@ -1221,6 +1243,16 @@ expression_tree_walker(Node *node,
 		case T_Query:
 			/* Do nothing with a sub-Query, per discussion above */
 			break;
+		case T_WindowClause:
+			{
+				WindowClause    *wc = (WindowClause *) node;
+
+				if (walker(wc->partitionClause, context))
+					return true;
+				if (walker(wc->orderClause, context))
+					return true;
+			}
+			break;
 		case T_CommonTableExpr:
 			{
 				CommonTableExpr *cte = (CommonTableExpr *) node;
@@ -1539,6 +1571,16 @@ expression_tree_mutator(Node *node,
 				return (Node *) newnode;
 			}
 			break;
+		case T_WindowFunc:
+			{
+				WindowFunc *wfunc = (WindowFunc *) node;
+				WindowFunc *newnode;
+
+				FLATCOPY(newnode, wfunc, WindowFunc);
+				MUTATE(newnode->args, wfunc->args, List *);
+				return (Node *) newnode;
+			}
+			break;
 		case T_ArrayRef:
 			{
 				ArrayRef   *arrayref = (ArrayRef *) node;
@@ -1848,6 +1890,17 @@ expression_tree_mutator(Node *node,
 		case T_Query:
 			/* Do nothing with a sub-Query, per discussion above */
 			return node;
+		case T_WindowClause:
+			{
+				WindowClause    *wc = (WindowClause *) node;
+				WindowClause    *newnode;
+
+				FLATCOPY(newnode, wc, WindowClause);
+				MUTATE(newnode->partitionClause, wc->partitionClause, List *);
+				MUTATE(newnode->orderClause, wc->orderClause, List *);
+				return (Node *) newnode;
+			}
+			break;
 		case T_CommonTableExpr:
 			{
 				CommonTableExpr *cte = (CommonTableExpr *) node;
@@ -2280,6 +2333,8 @@ raw_expression_tree_walker(Node *node, bool (*walker) (), void *context)
 					return true;
 				if (walker(stmt->havingClause, context))
 					return true;
+				if (walker(stmt->windowClause, context))
+					return true;
 				if (walker(stmt->withClause, context))
 					return true;
 				if (walker(stmt->valuesLists, context))
@@ -2318,6 +2373,8 @@ raw_expression_tree_walker(Node *node, bool (*walker) (), void *context)
 
 				if (walker(fcall->args, context))
 					return true;
+				if (walker(fcall->over, context))
+					return true;
 				/* function name is deemed uninteresting */
 			}
 			break;
@@ -2365,6 +2422,16 @@ raw_expression_tree_walker(Node *node, bool (*walker) (), void *context)
 			break;
 		case T_SortBy:
 			return walker(((SortBy *) node)->node, context);
+		case T_WindowDef:
+			{
+				WindowDef *wd = (WindowDef *) node;
+
+				if (walker(wd->partitionClause, context))
+					return true;
+				if (walker(wd->orderClause, context))
+					return true;
+			}
+			break;
 		case T_RangeSubselect:
 			{
 				RangeSubselect *rs = (RangeSubselect *) node;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 2477a17cfa3..f926f1314cd 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.346 2008/12/01 21:06:12 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.347 2008/12/28 18:53:56 tgl Exp $
  *
  * NOTES
  *	  Every node type that can appear in stored rules' parsetrees *must*
@@ -567,6 +567,36 @@ _outAgg(StringInfo str, Agg *node)
 }
 
 static void
+_outWindowAgg(StringInfo str, WindowAgg *node)
+{								/* append the text form of a WindowAgg plan node to str */
+	int			i;
+
+	WRITE_NODE_TYPE("WINDOWAGG");
+	/* the Plan part of the node is written by the shared helper */
+	_outPlanInfo(str, (Plan *) node);
+	/* partitioning columns: count, then one entry per column in each array */
+	WRITE_INT_FIELD(partNumCols);
+
+	appendStringInfo(str, " :partColIdx");
+	for (i = 0; i < node->partNumCols; i++)
+		appendStringInfo(str, " %d", node->partColIdx[i]);
+
+	appendStringInfo(str, " :partOperators");	/* label matches the member being dumped */
+	for (i = 0; i < node->partNumCols; i++)
+		appendStringInfo(str, " %u", node->partOperators[i]);
+	/* ordering columns, laid out the same way */
+	WRITE_INT_FIELD(ordNumCols);
+
+	appendStringInfo(str, " :ordColIdx");
+	for (i = 0; i < node->ordNumCols; i++)
+		appendStringInfo(str, " %d", node->ordColIdx[i]);
+
+	appendStringInfo(str, " :ordOperators");	/* label matches the member being dumped */
+	for (i = 0; i < node->ordNumCols; i++)
+		appendStringInfo(str, " %u", node->ordOperators[i]);
+}
+
+static void
 _outGroup(StringInfo str, Group *node)
 {
 	int			i;
@@ -799,6 +829,20 @@ _outAggref(StringInfo str, Aggref *node)
 }
 
 static void
+_outWindowFunc(StringInfo str, WindowFunc *node)
+{								/* append the text form of a WindowFunc node to str */
+	WRITE_NODE_TYPE("WINDOWFUNC");	/* tag that parseNodeString matches to call _readWindowFunc */
+
+	WRITE_OID_FIELD(winfnoid);	/* field order below is the order _readWindowFunc reads */
+	WRITE_OID_FIELD(wintype);
+	WRITE_NODE_FIELD(args);
+	WRITE_UINT_FIELD(winref);
+	WRITE_BOOL_FIELD(winstar);
+	WRITE_BOOL_FIELD(winagg);
+	WRITE_LOCATION_FIELD(location);
+}
+
+static void
 _outArrayRef(StringInfo str, ArrayRef *node)
 {
 	WRITE_NODE_TYPE("ARRAYREF");
@@ -1440,6 +1484,7 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node)
 	WRITE_NODE_FIELD(placeholder_list);
 	WRITE_NODE_FIELD(query_pathkeys);
 	WRITE_NODE_FIELD(group_pathkeys);
+	WRITE_NODE_FIELD(window_pathkeys);
 	WRITE_NODE_FIELD(distinct_pathkeys);
 	WRITE_NODE_FIELD(sort_pathkeys);
 	WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
@@ -1722,6 +1767,7 @@ _outSelectStmt(StringInfo str, SelectStmt *node)
 	WRITE_NODE_FIELD(whereClause);
 	WRITE_NODE_FIELD(groupClause);
 	WRITE_NODE_FIELD(havingClause);
+	WRITE_NODE_FIELD(windowClause);
 	WRITE_NODE_FIELD(withClause);
 	WRITE_NODE_FIELD(valuesLists);
 	WRITE_NODE_FIELD(sortClause);
@@ -1744,6 +1790,7 @@ _outFuncCall(StringInfo str, FuncCall *node)
 	WRITE_BOOL_FIELD(agg_star);
 	WRITE_BOOL_FIELD(agg_distinct);
 	WRITE_BOOL_FIELD(func_variadic);
+	WRITE_NODE_FIELD(over);
 	WRITE_LOCATION_FIELD(location);
 }
 
@@ -1866,6 +1913,7 @@ _outQuery(StringInfo str, Query *node)
 	WRITE_INT_FIELD(resultRelation);
 	WRITE_NODE_FIELD(intoClause);
 	WRITE_BOOL_FIELD(hasAggs);
+	WRITE_BOOL_FIELD(hasWindowFuncs);
 	WRITE_BOOL_FIELD(hasSubLinks);
 	WRITE_BOOL_FIELD(hasDistinctOn);
 	WRITE_BOOL_FIELD(hasRecursive);
@@ -1876,6 +1924,7 @@ _outQuery(StringInfo str, Query *node)
 	WRITE_NODE_FIELD(returningList);
 	WRITE_NODE_FIELD(groupClause);
 	WRITE_NODE_FIELD(havingQual);
+	WRITE_NODE_FIELD(windowClause);
 	WRITE_NODE_FIELD(distinctClause);
 	WRITE_NODE_FIELD(sortClause);
 	WRITE_NODE_FIELD(limitOffset);
@@ -1896,6 +1945,19 @@ _outSortGroupClause(StringInfo str, SortGroupClause *node)
 }
 
 static void
+_outWindowClause(StringInfo str, WindowClause *node)
+{								/* append the text form of a WindowClause node to str */
+	WRITE_NODE_TYPE("WINDOWCLAUSE");	/* tag that parseNodeString matches to call _readWindowClause */
+
+	WRITE_STRING_FIELD(name);	/* field order below is the order _readWindowClause reads */
+	WRITE_STRING_FIELD(refname);
+	WRITE_NODE_FIELD(partitionClause);
+	WRITE_NODE_FIELD(orderClause);
+	WRITE_UINT_FIELD(winref);
+	WRITE_BOOL_FIELD(copiedOrder);
+}
+
+static void
 _outRowMarkClause(StringInfo str, RowMarkClause *node)
 {
 	WRITE_NODE_TYPE("ROWMARKCLAUSE");
@@ -2172,6 +2234,18 @@ _outSortBy(StringInfo str, SortBy *node)
 }
 
 static void
+_outWindowDef(StringInfo str, WindowDef *node)
+{								/* append the text form of a raw-parse-tree WindowDef node to str */
+	WRITE_NODE_TYPE("WINDOWDEF");	/* NOTE(review): the readfuncs.c hunks of this patch add no reader for this tag */
+
+	WRITE_STRING_FIELD(name);	/* same five fields that _equalWindowDef compares */
+	WRITE_STRING_FIELD(refname);
+	WRITE_NODE_FIELD(partitionClause);
+	WRITE_NODE_FIELD(orderClause);
+	WRITE_LOCATION_FIELD(location);
+}
+
+static void
 _outRangeSubselect(StringInfo str, RangeSubselect *node)
 {
 	WRITE_NODE_TYPE("RANGESUBSELECT");
@@ -2347,6 +2421,9 @@ _outNode(StringInfo str, void *obj)
 			case T_Agg:
 				_outAgg(str, obj);
 				break;
+			case T_WindowAgg:
+				_outWindowAgg(str, obj);
+				break;
 			case T_Group:
 				_outGroup(str, obj);
 				break;
@@ -2392,6 +2469,9 @@ _outNode(StringInfo str, void *obj)
 			case T_Aggref:
 				_outAggref(str, obj);
 				break;
+			case T_WindowFunc:
+				_outWindowFunc(str, obj);
+				break;
 			case T_ArrayRef:
 				_outArrayRef(str, obj);
 				break;
@@ -2616,6 +2696,9 @@ _outNode(StringInfo str, void *obj)
 			case T_SortGroupClause:
 				_outSortGroupClause(str, obj);
 				break;
+			case T_WindowClause:
+				_outWindowClause(str, obj);
+				break;
 			case T_RowMarkClause:
 				_outRowMarkClause(str, obj);
 				break;
@@ -2661,6 +2744,9 @@ _outNode(StringInfo str, void *obj)
 			case T_SortBy:
 				_outSortBy(str, obj);
 				break;
+			case T_WindowDef:
+				_outWindowDef(str, obj);
+				break;
 			case T_RangeSubselect:
 				_outRangeSubselect(str, obj);
 				break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index ed5b55fb571..7bcc8e8047d 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/readfuncs.c,v 1.217 2008/11/15 19:43:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/readfuncs.c,v 1.218 2008/12/28 18:53:56 tgl Exp $
  *
  * NOTES
  *	  Path and Plan nodes do not have any readfuncs support, because we
@@ -153,6 +153,7 @@ _readQuery(void)
 	READ_INT_FIELD(resultRelation);
 	READ_NODE_FIELD(intoClause);
 	READ_BOOL_FIELD(hasAggs);
+	READ_BOOL_FIELD(hasWindowFuncs);
 	READ_BOOL_FIELD(hasSubLinks);
 	READ_BOOL_FIELD(hasDistinctOn);
 	READ_BOOL_FIELD(hasRecursive);
@@ -163,6 +164,7 @@ _readQuery(void)
 	READ_NODE_FIELD(returningList);
 	READ_NODE_FIELD(groupClause);
 	READ_NODE_FIELD(havingQual);
+	READ_NODE_FIELD(windowClause);
 	READ_NODE_FIELD(distinctClause);
 	READ_NODE_FIELD(sortClause);
 	READ_NODE_FIELD(limitOffset);
@@ -218,6 +220,24 @@ _readSortGroupClause(void)
 }
 
 /*
+ * _readWindowClause - read a WindowClause, field by field, in the order _outWindowClause writes it
+ */
+static WindowClause *
+_readWindowClause(void)
+{
+	READ_LOCALS(WindowClause);	/* NOTE(review): macro defined elsewhere; presumably declares the node being filled in */
+
+	READ_STRING_FIELD(name);
+	READ_STRING_FIELD(refname);
+	READ_NODE_FIELD(partitionClause);
+	READ_NODE_FIELD(orderClause);
+	READ_UINT_FIELD(winref);
+	READ_BOOL_FIELD(copiedOrder);
+
+	READ_DONE();				/* NOTE(review): macro defined elsewhere; presumably returns the finished node */
+}
+
+/*
  * _readRowMarkClause
  */
 static RowMarkClause *
@@ -403,6 +423,25 @@ _readAggref(void)
 }
 
 /*
+ * _readWindowFunc - read a WindowFunc, field by field, in the order _outWindowFunc writes it
+ */
+static WindowFunc *
+_readWindowFunc(void)
+{
+	READ_LOCALS(WindowFunc);	/* NOTE(review): macro defined elsewhere; presumably declares the node being filled in */
+
+	READ_OID_FIELD(winfnoid);
+	READ_OID_FIELD(wintype);
+	READ_NODE_FIELD(args);
+	READ_UINT_FIELD(winref);
+	READ_BOOL_FIELD(winstar);
+	READ_BOOL_FIELD(winagg);
+	READ_LOCATION_FIELD(location);
+
+	READ_DONE();				/* NOTE(review): macro defined elsewhere; presumably returns the finished node */
+}
+
+/*
  * _readArrayRef
  */
 static ArrayRef *
@@ -1091,6 +1130,8 @@ parseNodeString(void)
 		return_value = _readQuery();
 	else if (MATCH("SORTGROUPCLAUSE", 15))
 		return_value = _readSortGroupClause();
+	else if (MATCH("WINDOWCLAUSE", 12))
+		return_value = _readWindowClause();
 	else if (MATCH("ROWMARKCLAUSE", 13))
 		return_value = _readRowMarkClause();
 	else if (MATCH("COMMONTABLEEXPR", 15))
@@ -1111,6 +1152,8 @@ parseNodeString(void)
 		return_value = _readParam();
 	else if (MATCH("AGGREF", 6))
 		return_value = _readAggref();
+	else if (MATCH("WINDOWFUNC", 10))
+		return_value = _readWindowFunc();
 	else if (MATCH("ARRAYREF", 8))
 		return_value = _readArrayRef();
 	else if (MATCH("FUNCEXPR", 8))
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index b0553894c24..17eebc67647 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.177 2008/11/15 19:43:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.178 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -929,10 +929,13 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels)
  * 1. If the subquery has a LIMIT clause, we must not push down any quals,
  * since that could change the set of rows returned.
  *
- * 2. If the subquery contains EXCEPT or EXCEPT ALL set ops we cannot push
+ * 2. If the subquery contains any window functions, we can't push quals
+ * into it, because that would change the results.
+ *
+ * 3. If the subquery contains EXCEPT or EXCEPT ALL set ops we cannot push
  * quals into it, because that would change the results.
  *
- * 3. For subqueries using UNION/UNION ALL/INTERSECT/INTERSECT ALL, we can
+ * 4. For subqueries using UNION/UNION ALL/INTERSECT/INTERSECT ALL, we can
  * push quals into each component query, but the quals can only reference
  * subquery columns that suffer no type coercions in the set operation.
  * Otherwise there are possible semantic gotchas.  So, we check the
@@ -950,6 +953,10 @@ subquery_is_pushdown_safe(Query *subquery, Query *topquery,
 	if (subquery->limitOffset != NULL || subquery->limitCount != NULL)
 		return false;
 
+	/* Check point 2 */
+	if (subquery->hasWindowFuncs)
+		return false;
+
 	/* Are we at top level, or looking at a setop component? */
 	if (subquery == topquery)
 	{
@@ -1093,6 +1100,12 @@ qual_is_pushdown_safe(Query *subquery, Index rti, Node *qual,
 		return false;
 
 	/*
+	 * It would be unsafe to push down window function calls, but at least
+	 * for the moment we could never see any in a qual anyhow.
+	 */
+	Assert(!contain_window_function(qual));
+
+	/*
 	 * Examine all Vars used in clause; since it's a restriction clause, all
 	 * such Vars must refer to subselect output columns.
 	 */
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 0b9c5819820..7f30dde869f 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.201 2008/11/22 22:47:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.202 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1284,6 +1284,40 @@ cost_agg(Path *path, PlannerInfo *root,
 }
 
 /*
+ * cost_windowagg
+ *		Estimate the cost of a WindowAgg plan node, input cost included, and
+ *		store the result in path->startup_cost and path->total_cost.
+ *
+ * The input is taken to be sorted already, so no sort cost is added here.
+ */
+void
+cost_windowagg(Path *path, PlannerInfo *root,
+			   int numWindowFuncs, int numPartCols, int numOrderCols,
+			   Cost input_startup_cost, Cost input_total_cost,
+			   double input_tuples)
+{
+	Cost		accum_cost = input_total_cost;
+
+	/*
+	 * Per input tuple we add: one cpu_operator_cost for every window
+	 * function (frequently far too low, but we cannot tell how many tuples
+	 * a window function will fetch), one cpu_operator_cost for every
+	 * partitioning or ordering column to cover the comparisons, and one
+	 * cpu_tuple_cost for general overhead.
+	 */
+	accum_cost += cpu_operator_cost * input_tuples * numWindowFuncs;
+	accum_cost += cpu_operator_cost * input_tuples * (numPartCols + numOrderCols);
+	accum_cost += cpu_tuple_cost * input_tuples;
+
+	/*
+	 * Nothing is charged before the first row: startup cost is the input's.
+	 * (root is not referenced in this function.)
+	 */
+	path->startup_cost = input_startup_cost;
+	path->total_cost = accum_cost;
+}
+
+/*
  * cost_group
  *		Determines and returns the cost of performing a Group plan node,
  *		including the cost of its input.
@@ -2155,6 +2189,11 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
 	 * OR, NOT). Simplistic, but a lot better than no model at all.
 	 *
+	 * Note that Aggref and WindowFunc nodes are (and should be) treated
+	 * like Vars --- whatever execution cost they have is absorbed into
+	 * plan-node-specific costing.  As far as expression evaluation is
+	 * concerned they're just like Vars.
+	 *
 	 * Should we try to account for the possibility of short-circuit
 	 * evaluation of AND/OR?  Probably *not*, because that would make the
 	 * results depend on the clause ordering, and we are not in any position
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index 3d35eb605d9..5f6d219a01a 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -10,7 +10,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.14 2008/12/01 21:06:13 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/equivclass.c,v 1.15 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -438,14 +438,16 @@ get_eclass_for_sort_expr(PlannerInfo *root,
 
 	/*
 	 * add_eq_member doesn't check for volatile functions, set-returning
-	 * functions, or aggregates, but such could appear in sort expressions; so
-	 * we have to check whether its const-marking was correct.
+	 * functions, aggregates, or window functions, but such could appear
+	 * in sort expressions; so we have to check whether its const-marking
+	 * was correct.
 	 */
 	if (newec->ec_has_const)
 	{
 		if (newec->ec_has_volatile ||
 			expression_returns_set((Node *) expr) ||
-			contain_agg_clause((Node *) expr))
+			contain_agg_clause((Node *) expr) ||
+			contain_window_function((Node *) expr))
 		{
 			newec->ec_has_const = false;
 			newem->em_is_const = false;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index f5d4f41c032..b53b5e1470e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.252 2008/11/20 19:52:54 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.253 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -3237,8 +3237,8 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
 	 * anything for Aggref nodes; this is okay since they are really
 	 * comparable to Vars.
 	 *
-	 * See notes in grouping_planner about why this routine and make_group are
-	 * the only ones in this file that worry about tlist eval cost.
+	 * See notes in grouping_planner about why only make_agg, make_windowagg
+	 * and make_group worry about tlist eval cost.
 	 */
 	if (qual)
 	{
@@ -3260,6 +3260,53 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
 	return node;
 }
 
+WindowAgg *
+make_windowagg(PlannerInfo *root, List *tlist, int numWindowFuncs,
+			   int partNumCols, AttrNumber *partColIdx, Oid *partOperators,
+			   int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators,
+			   Plan *lefttree)
+{
+	/* Build a WindowAgg plan node on top of lefttree and estimate its cost */
+	WindowAgg  *wagg = makeNode(WindowAgg);
+	Plan	   *wplan = &wagg->plan;
+	Path		cost_holder;	/* scratch Path that cost_windowagg fills in */
+	QualCost	tlist_cost;
+
+	wagg->partNumCols = partNumCols;
+	wagg->partColIdx = partColIdx;	/* caller's arrays are stored, not copied */
+	wagg->partOperators = partOperators;
+	wagg->ordNumCols = ordNumCols;
+	wagg->ordColIdx = ordColIdx;
+	wagg->ordOperators = ordOperators;
+
+	/* Take the size estimates from the input, then replace the cost figures */
+	copy_plan_costsize(wplan, lefttree);
+	cost_windowagg(&cost_holder, root,
+				   numWindowFuncs, partNumCols, ordNumCols,
+				   lefttree->startup_cost, lefttree->total_cost,
+				   lefttree->plan_rows);
+	wplan->startup_cost = cost_holder.startup_cost;
+	wplan->total_cost = cost_holder.total_cost;
+
+	/*
+	 * Add tlist evaluation cost: the one-time part counts toward both
+	 * startup and total cost, the per-tuple part is scaled by the row
+	 * estimate.  See grouping_planner for why only make_agg,
+	 * make_windowagg and make_group worry about tlist eval cost.
+	 */
+	cost_qual_eval(&tlist_cost, tlist, root);
+	wplan->startup_cost += tlist_cost.startup;
+	wplan->total_cost += tlist_cost.startup;
+	wplan->total_cost += tlist_cost.per_tuple * wplan->plan_rows;
+
+	/* Hook up the tree links; a WindowAgg node never carries a qual */
+	wplan->targetlist = tlist;
+	wplan->lefttree = lefttree;
+	wplan->righttree = NULL;
+	wplan->qual = NIL;
+
+	return wagg;
+}
+
 Group *
 make_group(PlannerInfo *root,
 		   List *tlist,
@@ -3300,8 +3347,8 @@ make_group(PlannerInfo *root,
 	 * lower plan level and will only be copied by the Group node. Worth
 	 * fixing?
 	 *
-	 * See notes in grouping_planner about why this routine and make_agg are
-	 * the only ones in this file that worry about tlist eval cost.
+	 * See notes in grouping_planner about why only make_agg, make_windowagg
+	 * and make_group worry about tlist eval cost.
 	 */
 	if (qual)
 	{
diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 8a6b2ad0345..f0f17d5f950 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.43 2008/08/25 22:42:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.44 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -95,11 +95,11 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
 	/*
 	 * Reject unoptimizable cases.
 	 *
-	 * We don't handle GROUP BY, because our current implementations of
-	 * grouping require looking at all the rows anyway, and so there's not
-	 * much point in optimizing MIN/MAX.
+	 * We don't handle GROUP BY or windowing, because our current
+	 * implementations of both require looking at all the rows anyway,
+	 * and so there's not much point in optimizing MIN/MAX.
 	 */
-	if (parse->groupClause)
+	if (parse->groupClause || parse->hasWindowFuncs)
 		return NULL;
 
 	/*
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 0a1d1d1559f..a8ea043a697 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.112 2008/10/22 20:17:51 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.113 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,9 +67,9 @@
  * PlannerInfo field and not a passed parameter is that the low-level routines
  * in indxpath.c need to see it.)
  *
- * Note: the PlannerInfo node also includes group_pathkeys, distinct_pathkeys,
- * and sort_pathkeys, which like query_pathkeys need to be canonicalized once
- * the info is available.
+ * Note: the PlannerInfo node also includes group_pathkeys, window_pathkeys,
+ * distinct_pathkeys, and sort_pathkeys, which like query_pathkeys need to be
+ * canonicalized once the info is available.
  *
  * tuple_fraction is interpreted as follows:
  *	  0: expect all tuples to be retrieved (normal case)
@@ -121,6 +121,8 @@ query_planner(PlannerInfo *root, List *tlist,
 													 root->query_pathkeys);
 		root->group_pathkeys = canonicalize_pathkeys(root,
 													 root->group_pathkeys);
+		root->window_pathkeys = canonicalize_pathkeys(root,
+													  root->window_pathkeys);
 		root->distinct_pathkeys = canonicalize_pathkeys(root,
 													root->distinct_pathkeys);
 		root->sort_pathkeys = canonicalize_pathkeys(root,
@@ -228,11 +230,12 @@ query_planner(PlannerInfo *root, List *tlist,
 	/*
 	 * We have completed merging equivalence sets, so it's now possible to
 	 * convert the requested query_pathkeys to canonical form.	Also
-	 * canonicalize the groupClause, distinctClause and sortClause pathkeys
-	 * for use later.
+	 * canonicalize the groupClause, windowClause, distinctClause and
+	 * sortClause pathkeys for use later.
 	 */
 	root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys);
 	root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys);
+	root->window_pathkeys = canonicalize_pathkeys(root, root->window_pathkeys);
 	root->distinct_pathkeys = canonicalize_pathkeys(root, root->distinct_pathkeys);
 	root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys);
 
@@ -287,10 +290,12 @@ query_planner(PlannerInfo *root, List *tlist,
 		 * If both GROUP BY and ORDER BY are specified, we will need two
 		 * levels of sort --- and, therefore, certainly need to read all the
 		 * tuples --- unless ORDER BY is a subset of GROUP BY.  Likewise if
-		 * we have both DISTINCT and GROUP BY.
+		 * we have both DISTINCT and GROUP BY, or if we have a window
+		 * specification not compatible with the GROUP BY.
 		 */
 		if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys) ||
-			!pathkeys_contained_in(root->distinct_pathkeys, root->group_pathkeys))
+			!pathkeys_contained_in(root->distinct_pathkeys, root->group_pathkeys) ||
+			!pathkeys_contained_in(root->window_pathkeys, root->group_pathkeys))
 			tuple_fraction = 0.0;
 	}
 	else if (parse->hasAggs || root->hasHavingQual)
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 7f91309032a..b4b578d5973 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.247 2008/12/18 18:20:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.248 2008/12/28 18:53:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -82,6 +82,18 @@ static void locate_grouping_columns(PlannerInfo *root,
 						List *sub_tlist,
 						AttrNumber *groupColIdx);
 static List *postprocess_setop_tlist(List *new_tlist, List *orig_tlist);
+static List *select_active_windows(PlannerInfo *root, WindowFuncLists *wflists);
+static List *make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc,
+									  List *tlist, bool canonicalize);
+static void get_column_info_for_window(PlannerInfo *root, WindowClause *wc,
+									   List *tlist,
+									   int numSortCols, AttrNumber *sortColIdx,
+									   int *partNumCols,
+									   AttrNumber **partColIdx,
+									   Oid **partOperators,
+									   int *ordNumCols,
+									   AttrNumber **ordColIdx,
+									   Oid **ordOperators);
 
 
 /*****************************************************************************
@@ -852,6 +864,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		AggClauseCounts agg_counts;
 		int			numGroupCols;
 		bool		use_hashed_grouping = false;
+		WindowFuncLists *wflists = NULL;
+		List	   *activeWindows = NIL;
 
 		MemSet(&agg_counts, 0, sizeof(AggClauseCounts));
 
@@ -867,6 +881,22 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		tlist = preprocess_targetlist(root, tlist);
 
 		/*
+		 * Locate any window functions in the tlist.  (We don't need to look
+		 * anywhere else, since expressions used in ORDER BY will be in there
+		 * too.)  Note that they could all have been eliminated by constant
+		 * folding, in which case we don't need to do any more work.
+		 */
+		if (parse->hasWindowFuncs)
+		{
+			wflists = find_window_functions((Node *) tlist,
+											list_length(parse->windowClause));
+			if (wflists->numWindowFuncs > 0)
+				activeWindows = select_active_windows(root, wflists);
+			else
+				parse->hasWindowFuncs = false;
+		}
+
+		/*
 		 * Generate appropriate target list for subplan; may be different from
 		 * tlist if grouping or aggregation is needed.
 		 */
@@ -890,6 +920,19 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		else
 			root->group_pathkeys = NIL;
 
+		/* We consider only the first (bottom) window in pathkeys logic */
+		if (activeWindows != NIL)
+		{
+			WindowClause *wc = (WindowClause *) linitial(activeWindows);
+
+			root->window_pathkeys = make_pathkeys_for_window(root,
+															 wc,
+															 tlist,
+															 false);
+		}
+		else
+			root->window_pathkeys = NIL;
+
 		if (parse->distinctClause &&
 			grouping_is_sortable(parse->distinctClause))
 			root->distinct_pathkeys =
@@ -927,11 +970,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * Figure out whether we want a sorted result from query_planner.
 		 *
 		 * If we have a sortable GROUP BY clause, then we want a result sorted
-		 * properly for grouping.  Otherwise, if there's a sortable DISTINCT
-		 * clause that's more rigorous than the ORDER BY clause, we try to
-		 * produce output that's sufficiently well sorted for the DISTINCT.
-		 * Otherwise, if there is an ORDER BY clause, we want to sort by the
-		 * ORDER BY clause.
+		 * properly for grouping.  Otherwise, if we have window functions to
+		 * evaluate, we try to sort for the first window.  Otherwise, if
+		 * there's a sortable DISTINCT clause that's more rigorous than the
+		 * ORDER BY clause, we try to produce output that's sufficiently well
+		 * sorted for the DISTINCT.  Otherwise, if there is an ORDER BY
+		 * clause, we want to sort by the ORDER BY clause.
 		 *
 		 * Note: if we have both ORDER BY and GROUP BY, and ORDER BY is a
 		 * superset of GROUP BY, it would be tempting to request sort by ORDER
@@ -942,6 +986,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 */
 		if (root->group_pathkeys)
 			root->query_pathkeys = root->group_pathkeys;
+		else if (root->window_pathkeys)
+			root->query_pathkeys = root->window_pathkeys;
 		else if (list_length(root->distinct_pathkeys) >
 				 list_length(root->sort_pathkeys))
 			root->query_pathkeys = root->distinct_pathkeys;
@@ -1092,10 +1138,10 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 				 *
 				 * Below this point, any tlist eval cost for added-on nodes
 				 * should be accounted for as we create those nodes.
-				 * Presently, of the node types we can add on, only Agg and
-				 * Group project new tlists (the rest just copy their input
-				 * tuples) --- so make_agg() and make_group() are responsible
-				 * for computing the added cost.
+				 * Presently, of the node types we can add on, only Agg,
+				 * WindowAgg, and Group project new tlists (the rest just copy
+				 * their input tuples) --- so make_agg(), make_windowagg() and
+				 * make_group() are responsible for computing the added cost.
 				 */
 				cost_qual_eval(&tlist_cost, sub_tlist, root);
 				result_plan->startup_cost += tlist_cost.startup;
@@ -1225,6 +1271,142 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												   NULL);
 			}
 		}						/* end of non-minmax-aggregate case */
+
+		/*
+		 * Since each window function could require a different sort order,
+		 * we stack up a WindowAgg node for each window, with sort steps
+		 * between them as needed.
+		 */
+		if (activeWindows)
+		{
+			List	   *window_tlist;
+			ListCell   *l;
+
+			/*
+			 * If the top-level plan node is one that cannot do expression
+			 * evaluation, we must insert a Result node to project the
+			 * desired tlist.  (In some cases this might not really be
+			 * required, but it's not worth trying to avoid it.)  Note that
+			 * on second and subsequent passes through the following loop,
+			 * the top-level node will be a WindowAgg which we know can
+			 * project; so we only need to check once.
+			 */
+			if (!is_projection_capable_plan(result_plan))
+			{
+				result_plan = (Plan *) make_result(root,
+												   NIL,
+												   NULL,
+												   result_plan);
+			}
+
+			/*
+			 * The "base" targetlist for all steps of the windowing process
+			 * is a flat tlist of all Vars and Aggs needed in the result.
+			 * (In some cases we wouldn't need to propagate all of these
+			 * all the way to the top, since they might only be needed as
+			 * inputs to WindowFuncs.  It's probably not worth trying to
+			 * optimize that though.)  As we climb up the stack, we add
+			 * outputs for the WindowFuncs computed at each level.  Also,
+			 * each input tlist has to present all the columns needed to
+			 * sort the data for the next WindowAgg step.  That's handled
+			 * internally by make_sort_from_pathkeys, but we need the
+			 * copyObject steps here to ensure that each plan node has
+			 * a separately modifiable tlist.
+			 */
+			window_tlist = flatten_tlist(tlist);
+			if (parse->hasAggs)
+				window_tlist = add_to_flat_tlist(window_tlist,
+											pull_agg_clause((Node *) tlist));
+			result_plan->targetlist = (List *) copyObject(window_tlist);
+
+			foreach(l, activeWindows)
+			{
+				WindowClause *wc = (WindowClause *) lfirst(l);
+				List	   *window_pathkeys;
+				int			partNumCols;
+				AttrNumber *partColIdx;
+				Oid		   *partOperators;
+				int			ordNumCols;
+				AttrNumber *ordColIdx;
+				Oid		   *ordOperators;
+
+				window_pathkeys = make_pathkeys_for_window(root,
+														   wc,
+														   tlist,
+														   true);
+
+				/*
+				 * This is a bit tricky: we build a sort node even if we don't
+				 * really have to sort.  Even when no explicit sort is needed,
+				 * we need to have suitable resjunk items added to the input
+				 * plan's tlist for any partitioning or ordering columns that
+				 * aren't plain Vars.  Furthermore, this way we can use
+				 * existing infrastructure to identify which input columns are
+				 * the interesting ones.
+				 */
+				if (window_pathkeys)
+				{
+					Sort	   *sort_plan;
+
+					sort_plan = make_sort_from_pathkeys(root,
+														result_plan,
+														window_pathkeys,
+														-1.0);
+					if (!pathkeys_contained_in(window_pathkeys,
+											   current_pathkeys))
+					{
+						/* we do indeed need to sort */
+						result_plan = (Plan *) sort_plan;
+						current_pathkeys = window_pathkeys;
+					}
+					/* In either case, extract the per-column information */
+					get_column_info_for_window(root, wc, tlist,
+											   sort_plan->numCols,
+											   sort_plan->sortColIdx,
+											   &partNumCols,
+											   &partColIdx,
+											   &partOperators,
+											   &ordNumCols,
+											   &ordColIdx,
+											   &ordOperators);
+				}
+				else
+				{
+					/* empty window specification, nothing to sort */
+					partNumCols = 0;
+					partColIdx = NULL;
+					partOperators = NULL;
+					ordNumCols = 0;
+					ordColIdx = NULL;
+					ordOperators = NULL;
+				}
+
+				if (lnext(l))
+				{
+					/* Add the current WindowFuncs to the running tlist */
+					window_tlist = add_to_flat_tlist(window_tlist,
+											wflists->windowFuncs[wc->winref]);
+				}
+				else
+				{
+					/* Install the original tlist in the topmost WindowAgg */
+					window_tlist = tlist;
+				}
+
+				/* ... and make the WindowAgg plan node */
+				result_plan = (Plan *)
+					make_windowagg(root,
+								   (List *) copyObject(window_tlist),
+								   list_length(wflists->windowFuncs[wc->winref]),
+								   partNumCols,
+								   partColIdx,
+								   partOperators,
+								   ordNumCols,
+								   ordColIdx,
+								   ordOperators,
+								   result_plan);
+			}
+		}
 	}							/* end of if (setOperations) */
 
 	/*
@@ -2030,7 +2212,8 @@ make_subplanTargetList(PlannerInfo *root,
 	 * If we're not grouping or aggregating, there's nothing to do here;
 	 * query_planner should receive the unmodified target list.
 	 */
-	if (!parse->hasAggs && !parse->groupClause && !root->hasHavingQual)
+	if (!parse->hasAggs && !parse->groupClause && !root->hasHavingQual &&
+		!parse->hasWindowFuncs)
 	{
 		*need_tlist_eval = true;
 		return tlist;
@@ -2039,7 +2222,9 @@ make_subplanTargetList(PlannerInfo *root,
 	/*
 	 * Otherwise, start with a "flattened" tlist (having just the vars
 	 * mentioned in the targetlist and HAVING qual --- but not upper-level
-	 * Vars; they will be replaced by Params later on).
+	 * Vars; they will be replaced by Params later on).  Note this includes
+	 * vars used in resjunk items, so we are covering the needs of ORDER BY
+	 * and window specifications.
 	 */
 	sub_tlist = flatten_tlist(tlist);
 	extravars = pull_var_clause(parse->havingQual, true);
@@ -2066,7 +2251,7 @@ make_subplanTargetList(PlannerInfo *root,
 		{
 			SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl);
 			Node	   *groupexpr = get_sortgroupclause_expr(grpcl, tlist);
-			TargetEntry *te = NULL;
+			TargetEntry *te;
 
 			/*
 			 * Find or make a matching sub_tlist entry.  If the groupexpr
@@ -2074,20 +2259,10 @@ make_subplanTargetList(PlannerInfo *root,
 			 * won't make multiple groupClause entries for the same TLE.)
 			 */
 			if (groupexpr && IsA(groupexpr, Var))
-			{
-				ListCell   *sl;
-
-				foreach(sl, sub_tlist)
-				{
-					TargetEntry *lte = (TargetEntry *) lfirst(sl);
+				te = tlist_member(groupexpr, sub_tlist);
+			else
+				te = NULL;
 
-					if (equal(groupexpr, lte->expr))
-					{
-						te = lte;
-						break;
-					}
-				}
-			}
 			if (!te)
 			{
 				te = makeTargetEntry((Expr *) groupexpr,
@@ -2112,7 +2287,7 @@ make_subplanTargetList(PlannerInfo *root,
  *
  * This is only needed if we don't use the sub_tlist chosen by
  * make_subplanTargetList.	We have to forget the column indexes found
- * by that routine and re-locate the grouping vars in the real sub_tlist.
+ * by that routine and re-locate the grouping exprs in the real sub_tlist.
  */
 static void
 locate_grouping_columns(PlannerInfo *root,
@@ -2137,18 +2312,10 @@ locate_grouping_columns(PlannerInfo *root,
 	{
 		SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl);
 		Node	   *groupexpr = get_sortgroupclause_expr(grpcl, tlist);
-		TargetEntry *te = NULL;
-		ListCell   *sl;
+		TargetEntry *te = tlist_member(groupexpr, sub_tlist);
 
-		foreach(sl, sub_tlist)
-		{
-			te = (TargetEntry *) lfirst(sl);
-			if (equal(groupexpr, te->expr))
-				break;
-		}
-		if (!sl)
+		if (!te)
 			elog(ERROR, "failed to locate grouping columns");
-
 		groupColIdx[keyno++] = te->resno;
 	}
 }
@@ -2190,3 +2357,219 @@ postprocess_setop_tlist(List *new_tlist, List *orig_tlist)
 		elog(ERROR, "resjunk output columns are not implemented");
 	return new_tlist;
 }
+
+/*
+ * select_active_windows
+ *		Create a list of the "active" window clauses (ie, those referenced
+ *		by non-deleted WindowFuncs) in the order they are to be executed.
+ *		'wflists' gives the WindowFuncs found in the query, indexed by winref;
+ *		the result is a new List pointing at the query's WindowClause nodes.
+ */
+static List *
+select_active_windows(PlannerInfo *root, WindowFuncLists *wflists)
+{
+	List	   *result;
+	List	   *actives;
+	ListCell   *lc;
+
+	/* First, make a list of the active windows */
+	actives = NIL;
+	foreach(lc, root->parse->windowClause)
+	{
+		WindowClause *wc = (WindowClause *) lfirst(lc);
+
+		/* It's only active if wflists shows some related WindowFuncs */
+		Assert(wc->winref <= wflists->maxWinRef);
+		if (wflists->windowFuncs[wc->winref] != NIL)
+			actives = lappend(actives, wc);
+	}
+
+	/*
+	 * Now, ensure that windows with identical partitioning/ordering clauses
+	 * are adjacent in the list.  The SQL standard requires this: only one
+	 * sort is to be used for such windows, even if they are otherwise
+	 * distinct (eg, different names or framing clauses).  We could be much
+	 * smarter, eg detect when one window's sort keys are a prefix of
+	 * another's (so sorting for the latter would do for the former), or put
+	 * first the windows matching a sort order available from the underlying
+	 * query; for the moment we are content with meeting the spec.
+	 */
+	result = NIL;
+	while (actives != NIL)
+	{
+		WindowClause *wc = (WindowClause *) linitial(actives);
+		ListCell   *prev;
+		ListCell   *next;
+
+		/* Move wc from actives to result */
+		actives = list_delete_first(actives);
+		result = lappend(result, wc);
+
+		/* Now move any matching windows from actives to result */
+		prev = NULL;
+		for (lc = list_head(actives); lc; lc = next)
+		{
+			WindowClause *wc2 = (WindowClause *) lfirst(lc);
+
+			next = lnext(lc);
+			if (equal(wc->partitionClause, wc2->partitionClause) &&
+				equal(wc->orderClause, wc2->orderClause))
+			{
+				actives = list_delete_cell(actives, lc, prev);
+				result = lappend(result, wc2);
+			}
+			else
+				prev = lc;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * make_pathkeys_for_window
+ *		Build the pathkeys list for the sort order a WindowClause needs
+ *		from its input.
+ *
+ * PARTITION BY columns come first, followed by the ORDER BY columns.
+ * Hash-based windowing might someday let us relax this requirement, but
+ * at present the input is always sorted.
+ */
+static List *
+make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc,
+						 List *tlist, bool canonicalize)
+{
+	List	   *sortclauses;
+	List	   *pathkeys;
+
+	/* Both clause lists must consist of sortable datatypes */
+	if (!grouping_is_sortable(wc->partitionClause))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("could not implement window PARTITION BY"),
+				 errdetail("Window partitioning columns must be of sortable datatypes.")));
+	if (!grouping_is_sortable(wc->orderClause))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("could not implement window ORDER BY"),
+				 errdetail("Window ordering columns must be of sortable datatypes.")));
+
+	/* Join copies of the two lists: partition keys first, then order keys */
+	sortclauses = list_copy(wc->partitionClause);
+	sortclauses = list_concat(sortclauses, list_copy(wc->orderClause));
+	pathkeys = make_pathkeys_for_sortclauses(root, sortclauses,
+											 tlist, canonicalize);
+	/* Discard the temporary combined list */
+	list_free(sortclauses);
+
+	return pathkeys;
+}
+
+/*----------
+ * get_column_info_for_window
+ *		Get the partitioning/ordering column numbers and equality operators
+ *		for a WindowAgg node.
+ *
+ * This depends on the behavior of make_pathkeys_for_window()!
+ *
+ * We are given the target WindowClause and an array of the input column
+ * numbers associated with the resulting pathkeys.  In the easy case, there
+ * are the same number of pathkey columns as partitioning + ordering columns
+ * and we just have to copy some data around.  However, it's possible that
+ * some of the original partitioning + ordering columns were eliminated as
+ * redundant during the transformation to pathkeys.  (This can happen even
+ * though the parser gets rid of obvious duplicates.  A typical scenario is a
+ * window specification "PARTITION BY x ORDER BY y" coupled with a clause
+ * "WHERE x = y" that causes the two sort columns to be recognized as
+ * redundant.)  In that unusual case, we have to work a lot harder to
+ * determine which keys are significant.
+ *
+ * The method used here is a bit brute-force: add the sort columns to a list
+ * one at a time and note when the resulting pathkey list gets longer.  But
+ * it's a sufficiently uncommon case that a faster way doesn't seem worth
+ * the amount of code refactoring that'd be needed.
+ *----------
+ */
+static void
+get_column_info_for_window(PlannerInfo *root, WindowClause *wc, List *tlist,
+						   int numSortCols, AttrNumber *sortColIdx,
+						   int *partNumCols,
+						   AttrNumber **partColIdx,
+						   Oid **partOperators,
+						   int *ordNumCols,
+						   AttrNumber **ordColIdx,
+						   Oid **ordOperators)
+{
+	int			numPart = list_length(wc->partitionClause);
+	int			numOrder = list_length(wc->orderClause);
+
+	if (numSortCols == numPart + numOrder)
+	{
+		/* easy case */
+		*partNumCols = numPart;
+		*partColIdx = sortColIdx;
+		*partOperators = extract_grouping_ops(wc->partitionClause);
+		*ordNumCols = numOrder;
+		*ordColIdx = sortColIdx + numPart;
+		*ordOperators = extract_grouping_ops(wc->orderClause);
+	}
+	else
+	{
+		List	   *sortclauses;
+		List	   *pathkeys;
+		int			scidx;
+		ListCell   *lc;
+
+		/* first, allocate what's certainly enough space for the arrays */
+		*partNumCols = 0;
+		*partColIdx = (AttrNumber *) palloc(numPart * sizeof(AttrNumber));
+		*partOperators = (Oid *) palloc(numPart * sizeof(Oid));
+		*ordNumCols = 0;
+		*ordColIdx = (AttrNumber *) palloc(numOrder * sizeof(AttrNumber));
+		*ordOperators = (Oid *) palloc(numOrder * sizeof(Oid));
+		sortclauses = NIL;
+		pathkeys = NIL;
+		scidx = 0;
+		foreach(lc, wc->partitionClause)
+		{
+			SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
+			List	   *new_pathkeys;
+
+			sortclauses = lappend(sortclauses, sgc);
+			new_pathkeys = make_pathkeys_for_sortclauses(root,
+														 sortclauses,
+														 tlist,
+														 true);
+			if (list_length(new_pathkeys) > list_length(pathkeys))
+			{
+				/* significant clause: note (*arr)[n], since [] binds before * */
+				(*partColIdx)[*partNumCols] = sortColIdx[scidx++];
+				(*partOperators)[*partNumCols] = sgc->eqop;
+				(*partNumCols)++;
+				pathkeys = new_pathkeys;
+			}
+		}
+		foreach(lc, wc->orderClause)
+		{
+			SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
+			List	   *new_pathkeys;
+
+			sortclauses = lappend(sortclauses, sgc);
+			new_pathkeys = make_pathkeys_for_sortclauses(root,
+														 sortclauses,
+														 tlist,
+														 true);
+			if (list_length(new_pathkeys) > list_length(pathkeys))
+			{
+				/* significant clause: note (*arr)[n], since [] binds before * */
+				(*ordColIdx)[*ordNumCols] = sortColIdx[scidx++];
+				(*ordOperators)[*ordNumCols] = sgc->eqop;
+				(*ordNumCols)++;
+				pathkeys = new_pathkeys;
+			}
+		}
+		/* complain if we didn't eat exactly the right number of sort cols */
+		if (scidx != numSortCols)
+			elog(ERROR, "failed to deconstruct sort operators into partitioning/ordering operators");
+	}
+}
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 9bec109f6f5..83447082f5b 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/setrefs.c,v 1.146 2008/10/21 20:42:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/setrefs.c,v 1.147 2008/12/28 18:53:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -415,6 +415,7 @@ set_plan_refs(PlannerGlobal *glob, Plan *plan, int rtoffset)
 			}
 			break;
 		case T_Agg:
+		case T_WindowAgg:
 		case T_Group:
 			set_upper_references(glob, plan, rtoffset);
 			break;
@@ -679,6 +680,11 @@ fix_expr_common(PlannerGlobal *glob, Node *node)
 		record_plan_function_dependency(glob,
 										((Aggref *) node)->aggfnoid);
 	}
+	else if (IsA(node, WindowFunc))
+	{
+		record_plan_function_dependency(glob,
+										((WindowFunc *) node)->winfnoid);
+	}
 	else if (IsA(node, FuncExpr))
 	{
 		record_plan_function_dependency(glob,
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index c999fb6419c..a38f8c09ae7 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/subselect.c,v 1.143 2008/12/08 00:16:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/subselect.c,v 1.144 2008/12/28 18:53:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1243,6 +1243,7 @@ simplify_EXISTS_query(Query *query)
 		query->intoClause ||
 		query->setOperations ||
 		query->hasAggs ||
+		query->hasWindowFuncs ||
 		query->havingQual ||
 		query->limitOffset ||
 		query->limitCount ||
@@ -1258,13 +1259,14 @@ simplify_EXISTS_query(Query *query)
 
 	/*
 	 * Otherwise, we can throw away the targetlist, as well as any GROUP,
-	 * DISTINCT, and ORDER BY clauses; none of those clauses will change
-	 * a nonzero-rows result to zero rows or vice versa.  (Furthermore,
+	 * WINDOW, DISTINCT, and ORDER BY clauses; none of those clauses will
+	 * change a nonzero-rows result to zero rows or vice versa.  (Furthermore,
 	 * since our parsetree representation of these clauses depends on the
 	 * targetlist, we'd better throw them away if we drop the targetlist.)
 	 */
 	query->targetList = NIL;
 	query->groupClause = NIL;
+	query->windowClause = NIL;
 	query->distinctClause = NIL;
 	query->sortClause = NIL;
 	query->hasDistinctOn = false;
@@ -1321,8 +1323,8 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect,
 	 * The rest of the sub-select must not refer to any Vars of the parent
 	 * query.  (Vars of higher levels should be okay, though.)
 	 *
-	 * Note: we need not check for Aggs separately because we know the
-	 * sub-select is as yet unoptimized; any uplevel Agg must therefore
+	 * Note: we need not check for Aggrefs separately because we know the
+	 * sub-select is as yet unoptimized; any uplevel Aggref must therefore
 	 * contain an uplevel Var reference.  This is not the case below ...
 	 */
 	if (contain_vars_of_level((Node *) subselect, 1))
@@ -1432,7 +1434,8 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect,
 	/*
 	 * And there can't be any child Vars in the stuff we intend to pull up.
 	 * (Note: we'd need to check for child Aggs too, except we know the
-	 * child has no aggs at all because of simplify_EXISTS_query's check.)
+	 * child has no aggs at all because of simplify_EXISTS_query's check.
+	 * The same goes for window functions.)
 	 */
 	if (contain_vars_of_level((Node *) leftargs, 0))
 		return NULL;
@@ -1955,6 +1958,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params)
 		case T_RecursiveUnion:
 		case T_Hash:
 		case T_Agg:
+		case T_WindowAgg:
 		case T_SeqScan:
 		case T_Material:
 		case T_Sort:
diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c
index e4d508523e1..80a51d80786 100644
--- a/src/backend/optimizer/prep/prepjointree.c
+++ b/src/backend/optimizer/prep/prepjointree.c
@@ -16,7 +16,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepjointree.c,v 1.60 2008/11/11 19:05:21 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepjointree.c,v 1.61 2008/12/28 18:53:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -742,7 +742,10 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte,
 	 * Miscellaneous housekeeping.
 	 */
 	parse->hasSubLinks |= subquery->hasSubLinks;
-	/* subquery won't be pulled up if it hasAggs, so no work there */
+	/*
+	 * subquery won't be pulled up if it hasAggs or hasWindowFuncs, so no
+	 * work needed on those flags
+	 */
 
 	/*
 	 * Return the adjusted subquery jointree to replace the RangeTblRef entry
@@ -931,6 +934,7 @@ is_simple_subquery(Query *subquery)
 	 * limiting, or WITH.  (XXX WITH could possibly be allowed later)
 	 */
 	if (subquery->hasAggs ||
+		subquery->hasWindowFuncs ||
 		subquery->groupClause ||
 		subquery->havingQual ||
 		subquery->sortClause ||
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index bd7c05cc53d..f3a49cf9dee 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -22,7 +22,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.162 2008/11/15 19:43:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.163 2008/12/28 18:53:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -136,6 +136,7 @@ plan_set_operations(PlannerInfo *root, double tuple_fraction,
 	Assert(parse->jointree->quals == NULL);
 	Assert(parse->groupClause == NIL);
 	Assert(parse->havingQual == NULL);
+	Assert(parse->windowClause == NIL);
 	Assert(parse->distinctClause == NIL);
 
 	/*
diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c
index 3c74831f4da..ee45f32abbb 100644
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/clauses.c,v 1.271 2008/12/18 18:20:34 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/clauses.c,v 1.272 2008/12/28 18:53:57 tgl Exp $
  *
  * HISTORY
  *	  AUTHOR			DATE			MAJOR EVENT
@@ -72,7 +72,9 @@ typedef struct
 } substitute_actual_srf_parameters_context;
 
 static bool contain_agg_clause_walker(Node *node, void *context);
+static bool pull_agg_clause_walker(Node *node, List **context);
 static bool count_agg_clauses_walker(Node *node, AggClauseCounts *counts);
+static bool find_window_functions_walker(Node *node, WindowFuncLists *lists);
 static bool expression_returns_set_rows_walker(Node *node, double *count);
 static bool contain_subplans_walker(Node *node, void *context);
 static bool contain_mutable_functions_walker(Node *node, void *context);
@@ -389,6 +391,41 @@ contain_agg_clause_walker(Node *node, void *context)
 }
 
 /*
+ * pull_agg_clause
+ *	  Collect every Aggref node appearing within a clause.
+ *
+ *	  The Aggrefs are returned as a List, in the order they were found.
+ *
+ * Subqueries are not searched, so call this only once sublinks have been
+ * reduced to subplans, or where no subqueries can be present.  Outer-level
+ * aggregate references are not allowed in the clause either.
+ */
+List *
+pull_agg_clause(Node *clause)
+{
+	List	   *aggrefs = NIL;
+
+	(void) pull_agg_clause_walker(clause, &aggrefs);
+	return aggrefs;
+}
+
+static bool
+pull_agg_clause_walker(Node *node, List **context)
+{
+	if (node == NULL)
+		return false;
+	if (!IsA(node, Aggref))
+	{
+		Assert(!IsA(node, SubLink));	/* sublinks must be gone already */
+		return expression_tree_walker(node, pull_agg_clause_walker,
+									  (void *) context);
+	}
+	Assert(((Aggref *) node)->agglevelsup == 0);
+	*context = lappend(*context, node);
+	return false;				/* arguments need not be searched */
+}
+
+/*
  * count_agg_clauses
  *	  Recursively count the Aggref nodes in an expression tree.
  *
@@ -520,6 +557,79 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 
 
 /*****************************************************************************
+ *		Window-function clause manipulation
+ *****************************************************************************/
+
+/*
+ * contain_window_function
+ *	  Recursively search for WindowFunc nodes within a clause.
+ *
+ * Returns whatever rewriteManip.c's checkExprHasWindowFuncs() reports for
+ * the clause.  No separate walker is needed here: window functions carry no
+ * level field and are hard-wired to the current query level.
+ */
+bool
+contain_window_function(Node *clause)
+{
+	return checkExprHasWindowFuncs(clause);
+}
+
+/*
+ * find_window_functions
+ *	  Collect the WindowFunc nodes of an expression tree into per-window
+ *	  lists, indexed by winref.
+ *
+ * maxWinRef must be at least as large as any winref occurring in the tree.
+ */
+WindowFuncLists *
+find_window_functions(Node *clause, Index maxWinRef)
+{
+	WindowFuncLists *wfl = (WindowFuncLists *) palloc(sizeof(*wfl));
+
+	wfl->windowFuncs = (List **) palloc0((maxWinRef + 1) * sizeof(List *));
+	wfl->maxWinRef = maxWinRef;
+	wfl->numWindowFuncs = 0;
+	(void) find_window_functions_walker(clause, wfl);
+	return wfl;
+}
+
+static bool
+find_window_functions_walker(Node *node, WindowFuncLists *lists)
+{
+	WindowFunc *wfunc;
+	List	  **slot;
+
+	if (node == NULL)
+		return false;
+	if (!IsA(node, WindowFunc))
+	{
+		/* not a window function: keep looking further down the tree */
+		Assert(!IsA(node, SubLink));
+		return expression_tree_walker(node, find_window_functions_walker,
+									  (void *) lists);
+	}
+	wfunc = (WindowFunc *) node;
+
+	/* an unsigned winref can only be out of range on the high side */
+	if (wfunc->winref > lists->maxWinRef)
+		elog(ERROR, "WindowFunc contains out-of-range winref %u",
+			 wfunc->winref);
+
+	/* file the node under its window, and count it */
+	slot = &lists->windowFuncs[wfunc->winref];
+	*slot = lappend(*slot, wfunc);
+	lists->numWindowFuncs++;
+
+	/* a window function may not appear inside another one's arguments */
+	if (contain_window_function((Node *) wfunc->args))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("window function calls cannot be nested")));
+	return false;			/* args already checked; no need to recurse */
+}
+
+
+/*****************************************************************************
  *		Support for expressions returning sets
  *****************************************************************************/
 
@@ -567,6 +677,8 @@ expression_returns_set_rows_walker(Node *node, double *count)
 	/* Avoid recursion for some cases that can't return a set */
 	if (IsA(node, Aggref))
 		return false;
+	if (IsA(node, WindowFunc))
+		return false;
 	if (IsA(node, DistinctExpr))
 		return false;
 	if (IsA(node, ScalarArrayOpExpr))
@@ -897,6 +1009,11 @@ contain_nonstrict_functions_walker(Node *node, void *context)
 		/* an aggregate could return non-null with null input */
 		return true;
 	}
+	if (IsA(node, WindowFunc))
+	{
+		/* a window function could return non-null with null input */
+		return true;
+	}
 	if (IsA(node, ArrayRef))
 	{
 		/* array assignment is nonstrict, but subscripting is strict */
@@ -1589,7 +1706,8 @@ is_strict_saop(ScalarArrayOpExpr *expr, bool falseOK)
  * not-constant expressions, namely aggregates (Aggrefs).  In current usage
  * this is only applied to WHERE clauses and so a check for Aggrefs would be
  * a waste of cycles; but be sure to also check contain_agg_clause() if you
- * want to know about pseudo-constness in other contexts.
+ * want to know about pseudo-constness in other contexts.  The same goes
+ * for window functions (WindowFuncs).
  */
 bool
 is_pseudo_constant_clause(Node *clause)
@@ -3472,6 +3590,7 @@ inline_function(Oid funcid, Oid result_type, List *args,
 		querytree->utilityStmt ||
 		querytree->intoClause ||
 		querytree->hasAggs ||
+		querytree->hasWindowFuncs ||
 		querytree->hasSubLinks ||
 		querytree->cteList ||
 		querytree->rtable ||
@@ -3479,6 +3598,7 @@ inline_function(Oid funcid, Oid result_type, List *args,
 		querytree->jointree->quals ||
 		querytree->groupClause ||
 		querytree->havingQual ||
+		querytree->windowClause ||
 		querytree->distinctClause ||
 		querytree->sortClause ||
 		querytree->limitOffset ||
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 968f4ae367a..aab3d032b12 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/tlist.c,v 1.83 2008/10/21 20:42:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/tlist.c,v 1.84 2008/12/28 18:53:57 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -101,28 +101,28 @@ flatten_tlist(List *tlist)
 
 /*
  * add_to_flat_tlist
- *		Add more vars to a flattened tlist (if they're not already in it)
+ *		Add more items to a flattened tlist (if they're not already in it)
  *
  * 'tlist' is the flattened tlist
- * 'vars' is a list of Var and/or PlaceHolderVar nodes
+ * 'exprs' is a list of expressions (usually, but not necessarily, Vars)
  *
  * Returns the extended tlist.
  */
 List *
-add_to_flat_tlist(List *tlist, List *vars)
+add_to_flat_tlist(List *tlist, List *exprs)
 {
 	int			next_resno = list_length(tlist) + 1;
-	ListCell   *v;
+	ListCell   *lc;
 
-	foreach(v, vars)
+	foreach(lc, exprs)
 	{
-		Node	   *var = (Node *) lfirst(v);
+		Node	   *expr = (Node *) lfirst(lc);
 
-		if (!tlist_member(var, tlist))
+		if (!tlist_member(expr, tlist))
 		{
 			TargetEntry *tle;
 
-			tle = makeTargetEntry(copyObject(var),		/* copy needed?? */
+			tle = makeTargetEntry(copyObject(expr),		/* copy needed?? */
 								  next_resno++,
 								  NULL,
 								  false);
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index cdac02b71db..70688655cce 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -17,7 +17,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.384 2008/12/13 02:00:19 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.385 2008/12/28 18:53:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -306,6 +306,9 @@ transformDeleteStmt(ParseState *pstate, DeleteStmt *stmt)
 	qry->hasAggs = pstate->p_hasAggs;
 	if (pstate->p_hasAggs)
 		parseCheckAggregates(pstate, qry);
+	qry->hasWindowFuncs = pstate->p_hasWindowFuncs;
+	if (pstate->p_hasWindowFuncs)
+		parseCheckWindowFuncs(pstate, qry);
 
 	return qry;
 }
@@ -673,6 +676,12 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
 				 errmsg("cannot use aggregate function in VALUES"),
 				 parser_errposition(pstate,
 									locate_agg_of_level((Node *) qry, 0))));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in VALUES"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) qry))));
 
 	return qry;
 }
@@ -764,6 +773,9 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	/* make FOR UPDATE/FOR SHARE info available to addRangeTableEntry */
 	pstate->p_locking_clause = stmt->lockingClause;
 
+	/* make WINDOW info available for window functions, too */
+	pstate->p_windowdefs = stmt->windowClause;
+
 	/* process the WITH clause */
 	if (stmt->withClause)
 	{
@@ -803,7 +815,8 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->groupClause = transformGroupClause(pstate,
 											stmt->groupClause,
 											&qry->targetList,
-											qry->sortClause);
+											qry->sortClause,
+											false);
 
 	if (stmt->distinctClause == NIL)
 	{
@@ -834,6 +847,11 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->limitCount = transformLimitClause(pstate, stmt->limitCount,
 										   "LIMIT");
 
+	/* transform window clauses after we have seen all window functions */
+	qry->windowClause = transformWindowDefinitions(pstate,
+												   pstate->p_windowdefs,
+												   &qry->targetList);
+
 	/* handle any SELECT INTO/CREATE TABLE AS spec */
 	if (stmt->intoClause)
 	{
@@ -849,6 +867,9 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->hasAggs = pstate->p_hasAggs;
 	if (pstate->p_hasAggs || qry->groupClause || qry->havingQual)
 		parseCheckAggregates(pstate, qry);
+	qry->hasWindowFuncs = pstate->p_hasWindowFuncs;
+	if (pstate->p_hasWindowFuncs)
+		parseCheckWindowFuncs(pstate, qry);
 
 	foreach(l, stmt->lockingClause)
 	{
@@ -889,6 +910,7 @@ transformValuesClause(ParseState *pstate, SelectStmt *stmt)
 	Assert(stmt->whereClause == NULL);
 	Assert(stmt->groupClause == NIL);
 	Assert(stmt->havingClause == NULL);
+	Assert(stmt->windowClause == NIL);
 	Assert(stmt->op == SETOP_NONE);
 
 	/* process the WITH clause */
@@ -1061,6 +1083,12 @@ transformValuesClause(ParseState *pstate, SelectStmt *stmt)
 				 errmsg("cannot use aggregate function in VALUES"),
 				 parser_errposition(pstate,
 									locate_agg_of_level((Node *) newExprsLists, 0))));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in VALUES"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) newExprsLists))));
 
 	return qry;
 }
@@ -1289,6 +1317,9 @@ transformSetOperationStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->hasAggs = pstate->p_hasAggs;
 	if (pstate->p_hasAggs || qry->groupClause || qry->havingQual)
 		parseCheckAggregates(pstate, qry);
+	qry->hasWindowFuncs = pstate->p_hasWindowFuncs;
+	if (pstate->p_hasWindowFuncs)
+		parseCheckWindowFuncs(pstate, qry);
 
 	foreach(l, lockingClause)
 	{
@@ -1623,6 +1654,12 @@ transformUpdateStmt(ParseState *pstate, UpdateStmt *stmt)
 				 errmsg("cannot use aggregate function in UPDATE"),
 				 parser_errposition(pstate,
 									locate_agg_of_level((Node *) qry, 0))));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in UPDATE"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) qry))));
 
 	/*
 	 * Now we are done with SELECT-like processing, and can get on with
@@ -1692,6 +1729,7 @@ transformReturningList(ParseState *pstate, List *returningList)
 	List	   *rlist;
 	int			save_next_resno;
 	bool		save_hasAggs;
+	bool		save_hasWindowFuncs;
 	int			length_rtable;
 
 	if (returningList == NIL)
@@ -1708,6 +1746,8 @@ transformReturningList(ParseState *pstate, List *returningList)
 	/* save other state so that we can detect disallowed stuff */
 	save_hasAggs = pstate->p_hasAggs;
 	pstate->p_hasAggs = false;
+	save_hasWindowFuncs = pstate->p_hasWindowFuncs;
+	pstate->p_hasWindowFuncs = false;
 	length_rtable = list_length(pstate->p_rtable);
 
 	/* transform RETURNING identically to a SELECT targetlist */
@@ -1722,6 +1762,12 @@ transformReturningList(ParseState *pstate, List *returningList)
 				 errmsg("cannot use aggregate function in RETURNING"),
 				 parser_errposition(pstate,
 									locate_agg_of_level((Node *) rlist, 0))));
+	if (pstate->p_hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in RETURNING"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) rlist))));
 
 	/* no new relation references please */
 	if (list_length(pstate->p_rtable) != length_rtable)
@@ -1748,6 +1794,7 @@ transformReturningList(ParseState *pstate, List *returningList)
 	/* restore state */
 	pstate->p_next_resno = save_next_resno;
 	pstate->p_hasAggs = save_hasAggs;
+	pstate->p_hasWindowFuncs = save_hasWindowFuncs;
 
 	return rlist;
 }
@@ -1883,6 +1930,10 @@ CheckSelectLocking(Query *qry)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("SELECT FOR UPDATE/SHARE is not allowed with aggregate functions")));
+	if (qry->hasWindowFuncs)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("SELECT FOR UPDATE/SHARE is not allowed with window functions")));
 }
 
 /*
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 29eab503198..59b7ada7b43 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/gram.y,v 2.647 2008/12/20 16:02:55 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/gram.y,v 2.648 2008/12/28 18:53:58 tgl Exp $
  *
  * HISTORY
  *	  AUTHOR			DATE			MAJOR EVENT
@@ -158,6 +158,7 @@ static TypeName *TableFuncTypeName(List *columns);
 	DefElem				*defelt;
 	OptionDefElem		*optdef;
 	SortBy				*sortby;
+	WindowDef			*windef;
 	JoinExpr			*jexpr;
 	IndexElem			*ielem;
 	Alias				*alias;
@@ -402,6 +403,10 @@ static TypeName *TableFuncTypeName(List *columns);
 %type <with> 	with_clause
 %type <list>	cte_list
 
+%type <list>	window_clause window_definition_list opt_partition_clause
+%type <windef>	window_definition over_clause window_specification
+%type <str>		opt_existing_window_name
+
 
 /*
  * If you make any token changes, update the keyword table in
@@ -431,8 +436,8 @@ static TypeName *TableFuncTypeName(List *columns);
 	DEFERRABLE DEFERRED DEFINER DELETE_P DELIMITER DELIMITERS DESC
 	DICTIONARY DISABLE_P DISCARD DISTINCT DO DOCUMENT_P DOMAIN_P DOUBLE_P DROP
 
-	EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EXCEPT EXCLUDING
-	EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTERNAL EXTRACT
+	EACH ELSE ENABLE_P ENCODING ENCRYPTED END_P ENUM_P ESCAPE EXCEPT
+	EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTERNAL EXTRACT
 
 	FALSE_P FAMILY FETCH FIRST_P FLOAT_P FOR FORCE FOREIGN FORWARD
 	FREEZE FROM FULL FUNCTION
@@ -461,9 +466,9 @@ static TypeName *TableFuncTypeName(List *columns);
 	NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF NULLS_P NUMERIC
 
 	OBJECT_P OF OFF OFFSET OIDS OLD ON ONLY OPERATOR OPTION OPTIONS OR
-	ORDER OUT_P OUTER_P OVERLAPS OVERLAY OWNED OWNER
+	ORDER OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
 
-	PARSER PARTIAL PASSWORD PLACING PLANS POSITION
+	PARSER PARTIAL PARTITION PASSWORD PLACING PLANS POSITION
 	PRECISION PRESERVE PREPARE PREPARED PRIMARY
 	PRIOR PRIVILEGES PROCEDURAL PROCEDURE
 
@@ -489,7 +494,7 @@ static TypeName *TableFuncTypeName(List *columns);
 	VACUUM VALID VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING
 	VERBOSE VERSION_P VIEW VOLATILE
 
-	WHEN WHERE WHITESPACE_P WITH WITHOUT WORK WRAPPER WRITE
+	WHEN WHERE WHITESPACE_P WINDOW WITH WITHOUT WORK WRAPPER WRITE
 
 	XML_P XMLATTRIBUTES XMLCONCAT XMLELEMENT XMLFOREST XMLPARSE
 	XMLPI XMLROOT XMLSERIALIZE
@@ -523,7 +528,15 @@ static TypeName *TableFuncTypeName(List *columns);
 %nonassoc	BETWEEN
 %nonassoc	IN_P
 %left		POSTFIXOP		/* dummy for postfix Op rules */
-%nonassoc	IDENT			/* to support target_el without AS */
+/*
+ * To support target_el without AS, we must give IDENT an explicit priority
+ * between POSTFIXOP and Op.  We can safely assign the same priority to
+ * various unreserved keywords as needed to resolve ambiguities (this can't
+ * have any bad effects since obviously the keywords will still behave the
+ * same as if they weren't keywords).  We need to do this for PARTITION
+ * to support opt_existing_window_name.
+ */
+%nonassoc	IDENT PARTITION
 %left		Op OPERATOR		/* multi-character ops and user-defined operators */
 %nonassoc	NOTNULL
 %nonassoc	ISNULL
@@ -1259,7 +1272,7 @@ opt_boolean:
  * - an integer or floating point number
  * - a time interval per SQL99
  * ColId gives reduce/reduce errors against ConstInterval and LOCAL,
- * so use IDENT and reject anything which is a reserved word.
+ * so use IDENT (meaning we reject anything that is a key word).
  */
 zone_value:
 			Sconst
@@ -3466,6 +3479,11 @@ old_aggr_list: old_aggr_elem						{ $$ = list_make1($1); }
 			| old_aggr_list ',' old_aggr_elem		{ $$ = lappend($1, $3); }
 		;
 
+/*
+ * Must use IDENT here to avoid reduce/reduce conflicts; fortunately none of
+ * the item names needed in old aggregate definitions are likely to become
+ * SQL keywords.
+ */
 old_aggr_elem:  IDENT '=' def_arg
 				{
 					$$ = makeDefElem($1, (Node *)$3);
@@ -6825,7 +6843,7 @@ select_clause:
 simple_select:
 			SELECT opt_distinct target_list
 			into_clause from_clause where_clause
-			group_clause having_clause
+			group_clause having_clause window_clause
 				{
 					SelectStmt *n = makeNode(SelectStmt);
 					n->distinctClause = $2;
@@ -6835,6 +6853,7 @@ simple_select:
 					n->whereClause = $6;
 					n->groupClause = $7;
 					n->havingClause = $8;
+					n->windowClause = $9;
 					$$ = (Node *)n;
 				}
 			| values_clause							{ $$ = $1; }
@@ -8076,6 +8095,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @2;
 					$$ = (Node *) n;
 				}
@@ -8135,6 +8155,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @4;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "~~", $1, (Node *) n, @2);
 				}
@@ -8148,6 +8169,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @5;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "!~~", $1, (Node *) n, @2);
 				}
@@ -8161,6 +8183,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @4;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "~~*", $1, (Node *) n, @2);
 				}
@@ -8174,6 +8197,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @5;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "!~~*", $1, (Node *) n, @2);
 				}
@@ -8186,6 +8210,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @2;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "~", $1, (Node *) n, @2);
 				}
@@ -8197,6 +8222,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @5;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "~", $1, (Node *) n, @2);
 				}
@@ -8208,6 +8234,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @5;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "!~", $1, (Node *) n, @2);
 				}
@@ -8219,6 +8246,7 @@ a_expr:		c_expr									{ $$ = $1; }
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @6;
 					$$ = (Node *) makeSimpleA_Expr(AEXPR_OP, "!~", $1, (Node *) n, @2);
 				}
@@ -8622,7 +8650,7 @@ c_expr:		columnref								{ $$ = $1; }
  * (Note that many of the special SQL functions wouldn't actually make any
  * sense as functional index entries, but we ignore that consideration here.)
  */
-func_expr:	func_name '(' ')'
+func_expr:	func_name '(' ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8630,10 +8658,11 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = $4;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' expr_list ')'
+			| func_name '(' expr_list ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8641,10 +8670,11 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = $5;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' VARIADIC a_expr ')'
+			| func_name '(' VARIADIC a_expr ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8652,10 +8682,11 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = TRUE;
+					n->over = $6;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' expr_list ',' VARIADIC a_expr ')'
+			| func_name '(' expr_list ',' VARIADIC a_expr ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8663,10 +8694,11 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = TRUE;
+					n->over = $8;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' ALL expr_list ')'
+			| func_name '(' ALL expr_list ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8678,10 +8710,11 @@ func_expr:	func_name '(' ')'
 					 * for that in FuncCall at the moment.
 					 */
 					n->func_variadic = FALSE;
+					n->over = $6;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' DISTINCT expr_list ')'
+			| func_name '(' DISTINCT expr_list ')' over_clause
 				{
 					FuncCall *n = makeNode(FuncCall);
 					n->funcname = $1;
@@ -8689,10 +8722,11 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = TRUE;
 					n->func_variadic = FALSE;
+					n->over = $6;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
-			| func_name '(' '*' ')'
+			| func_name '(' '*' ')' over_clause
 				{
 					/*
 					 * We consider AGGREGATE(*) to invoke a parameterless
@@ -8710,6 +8744,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = TRUE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = $5;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8769,6 +8804,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8839,6 +8875,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8850,6 +8887,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8861,6 +8899,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8872,6 +8911,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8883,6 +8923,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8894,6 +8935,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8907,6 +8949,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8923,6 +8966,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8935,6 +8979,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8949,6 +8994,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8969,6 +9015,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8983,6 +9030,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -8994,6 +9042,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -9005,6 +9054,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -9016,6 +9066,7 @@ func_expr:	func_name '(' ')'
 					n->agg_star = FALSE;
 					n->agg_distinct = FALSE;
 					n->func_variadic = FALSE;
+					n->over = NULL;
 					n->location = @1;
 					$$ = (Node *)n;
 				}
@@ -9157,6 +9208,77 @@ xml_whitespace_option: PRESERVE WHITESPACE_P		{ $$ = TRUE; }
 		;
 
 /*
+ * Window definitions: the WINDOW clause of SELECT, and OVER in function calls
+ */
+window_clause:
+			WINDOW window_definition_list			{ $$ = $2; }
+			| /*EMPTY*/								{ $$ = NIL; }
+		;
+
+window_definition_list:
+			window_definition						{ $$ = list_make1($1); }
+			| window_definition_list ',' window_definition
+													{ $$ = lappend($1, $3); }
+		;
+
+window_definition:
+			ColId AS window_specification
+				{
+					WindowDef *n = $3;
+					n->name = $1;	/* name declared in WINDOW clause */
+					$$ = n;
+				}
+		;
+
+over_clause: OVER window_specification
+				{ $$ = $2; }
+			| OVER ColId
+				{
+					WindowDef *n = makeNode(WindowDef);
+					n->name = NULL;
+					n->refname = $2;	/* window name being referenced */
+					n->partitionClause = NIL;
+					n->orderClause = NIL;
+					n->location = @2;	/* position of the window name */
+					$$ = n;
+				}
+			| /*EMPTY*/
+				{ $$ = NULL; }	/* function call has no OVER clause */
+		;
+
+window_specification: '(' opt_existing_window_name opt_partition_clause
+						opt_sort_clause ')'
+				{
+					WindowDef *n = makeNode(WindowDef);
+					n->name = NULL;	/* window_definition may set it */
+					n->refname = $2;	/* NULL if no existing window named */
+					n->partitionClause = $3;
+					n->orderClause = $4;
+					n->location = @1;	/* position of the '(' */
+					$$ = n;
+				}
+		;
+
+/*
+ * If we see PARTITION, RANGE, or ROWS as the first token after the '('
+ * of a window_specification, we want the assumption to be that there is
+ * no existing_window_name; but those keywords are unreserved and so could
+ * be ColIds.  We fix this by making them have the same precedence as IDENT
+ * and giving the empty production here a slightly higher precedence, so
+ * that the shift/reduce conflict is resolved in favor of reducing the rule.
+ * These keywords are thus precluded from being an existing_window_name but
+ * are not reserved for any other purpose.
+ * (RANGE/ROWS are not an issue as of 8.4 for lack of frame_clause support.)
+ */
+opt_existing_window_name: ColId						{ $$ = $1; }
+			| /*EMPTY*/				%prec Op		{ $$ = NULL; }
+		;
+
+opt_partition_clause: PARTITION BY expr_list		{ $$ = $3; }
+			| /*EMPTY*/								{ $$ = NIL; }
+		;
+
+/*
  * Supporting nonterminals for expressions.
  */
 
@@ -9961,6 +10083,7 @@ unreserved_keyword:
 			| OWNER
 			| PARSER
 			| PARTIAL
+			| PARTITION
 			| PASSWORD
 			| PLANS
 			| PREPARE
@@ -10139,6 +10262,7 @@ type_func_name_keyword:
 			| NATURAL
 			| NOTNULL
 			| OUTER_P
+			| OVER
 			| OVERLAPS
 			| RIGHT
 			| SIMILAR
@@ -10229,6 +10353,7 @@ reserved_keyword:
 			| VARIADIC
 			| WHEN
 			| WHERE
+			| WINDOW
 			| WITH
 		;
 
@@ -10451,6 +10576,7 @@ makeOverlaps(List *largs, List *rargs, int location)
 	n->agg_star = FALSE;
 	n->agg_distinct = FALSE;
 	n->func_variadic = FALSE;
+	n->over = NULL;
 	n->location = location;
 	return n;
 }
diff --git a/src/backend/parser/keywords.c b/src/backend/parser/keywords.c
index bf7b1f6ad2e..c3ad852258b 100644
--- a/src/backend/parser/keywords.c
+++ b/src/backend/parser/keywords.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.206 2008/12/19 16:25:17 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.207 2008/12/28 18:53:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -287,12 +287,14 @@ const ScanKeyword ScanKeywords[] = {
 	{"order", ORDER, RESERVED_KEYWORD},
 	{"out", OUT_P, COL_NAME_KEYWORD},
 	{"outer", OUTER_P, TYPE_FUNC_NAME_KEYWORD},
+	{"over", OVER, TYPE_FUNC_NAME_KEYWORD},
 	{"overlaps", OVERLAPS, TYPE_FUNC_NAME_KEYWORD},
 	{"overlay", OVERLAY, COL_NAME_KEYWORD},
 	{"owned", OWNED, UNRESERVED_KEYWORD},
 	{"owner", OWNER, UNRESERVED_KEYWORD},
 	{"parser", PARSER, UNRESERVED_KEYWORD},
 	{"partial", PARTIAL, UNRESERVED_KEYWORD},
+	{"partition", PARTITION, UNRESERVED_KEYWORD},
 	{"password", PASSWORD, UNRESERVED_KEYWORD},
 	{"placing", PLACING, RESERVED_KEYWORD},
 	{"plans", PLANS, UNRESERVED_KEYWORD},
@@ -411,6 +413,7 @@ const ScanKeyword ScanKeywords[] = {
 	{"when", WHEN, RESERVED_KEYWORD},
 	{"where", WHERE, RESERVED_KEYWORD},
 	{"whitespace", WHITESPACE_P, UNRESERVED_KEYWORD},
+	{"window", WINDOW, RESERVED_KEYWORD},
 	{"with", WITH, RESERVED_KEYWORD},
 	{"without", WITHOUT, UNRESERVED_KEYWORD},
 	{"work", WORK, UNRESERVED_KEYWORD},
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c
index e2645462d57..6dba470e39f 100644
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -1,14 +1,14 @@
 /*-------------------------------------------------------------------------
  *
  * parse_agg.c
- *	  handle aggregates in parser
+ *	  handle aggregates and window functions in parser
  *
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_agg.c,v 1.84 2008/10/04 21:56:54 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_agg.c,v 1.85 2008/12/28 18:53:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,7 +67,8 @@ transformAggregateCall(ParseState *pstate, Aggref *agg)
 	 */
 	if (min_varlevel == 0)
 	{
-		if (checkExprHasAggs((Node *) agg->args))
+		if (pstate->p_hasAggs &&
+			checkExprHasAggs((Node *) agg->args))
 			ereport(ERROR,
 					(errcode(ERRCODE_GROUPING_ERROR),
 					 errmsg("aggregate function calls cannot be nested"),
@@ -75,6 +76,15 @@ transformAggregateCall(ParseState *pstate, Aggref *agg)
 										locate_agg_of_level((Node *) agg->args, 0))));
 	}
 
+	/* It can't contain window functions either */
+	if (pstate->p_hasWindowFuncs &&
+		checkExprHasWindowFuncs((Node *) agg->args))
+		ereport(ERROR,
+				(errcode(ERRCODE_GROUPING_ERROR),
+				 errmsg("aggregate function calls cannot contain window function calls"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) agg->args))));
+
 	if (min_varlevel < 0)
 		min_varlevel = 0;
 	agg->agglevelsup = min_varlevel;
@@ -85,6 +95,98 @@ transformAggregateCall(ParseState *pstate, Aggref *agg)
 	pstate->p_hasAggs = true;
 }
 
+/*
+ * transformWindowFuncCall -
+ *		Finish initial transformation of a window function call
+ *
+ * parse_func.c has recognized the function as a window function, and has set
+ * up all the fields of 'wfunc' except winref.  Here we must (1) add 'windef'
+ * to p_windowdefs (if not a duplicate of an entry already present) and set
+ * winref to that entry's 1-based list position; and (2) mark p_hasWindowFuncs
+ * true.  Unlike aggregates, only the most closely nested pstate level need be
+ * considered --- there are no "outer window functions" per SQL spec.
+ */
+void
+transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
+						WindowDef *windef)
+{
+	/*
+	 * A window function call can't contain another one (but aggs are OK).
+	 * XXX is this required by spec, or just an unimplemented feature?
+	 */
+	if (pstate->p_hasWindowFuncs &&
+		checkExprHasWindowFuncs((Node *) wfunc->args))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("window function calls cannot be nested"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) wfunc->args))));
+
+	/*
+	 * If the OVER clause just specifies a reference name, find that
+	 * WINDOW clause (which had better be present).  Otherwise, try to
+	 * match all the properties of the OVER clause (comparing refname to
+	 * refname), and make a new entry in p_windowdefs if no luck.
+	 */
+	Assert(!windef->name);
+	if (windef->refname &&
+		windef->partitionClause == NIL &&
+		windef->orderClause == NIL)
+	{
+		Index		winref = 0;
+		ListCell   *lc;
+
+		foreach(lc, pstate->p_windowdefs)
+		{
+			WindowDef *refwin = (WindowDef *) lfirst(lc);
+
+			winref++;
+			if (refwin->name && strcmp(refwin->name, windef->refname) == 0)
+			{
+				wfunc->winref = winref;
+				break;
+			}
+		}
+		if (lc == NULL)			/* didn't find it? */
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("window \"%s\" does not exist", windef->refname),
+					 parser_errposition(pstate, windef->location)));
+	}
+	else
+	{
+		Index		winref = 0;
+		ListCell   *lc;
+
+		foreach(lc, pstate->p_windowdefs)
+		{
+			WindowDef *refwin = (WindowDef *) lfirst(lc);
+
+			winref++;
+			if (refwin->refname && windef->refname &&
+				strcmp(refwin->refname, windef->refname) == 0)
+				/* matched on refname (entry's name may be NULL) */ ;
+			else if (!refwin->refname && !windef->refname)
+				/* matched, no refname */ ;
+			else
+				continue;
+			if (equal(refwin->partitionClause, windef->partitionClause) &&
+				equal(refwin->orderClause, windef->orderClause))
+			{
+				/* found a duplicate window specification */
+				wfunc->winref = winref;
+				break;
+			}
+		}
+		if (lc == NULL)			/* didn't find it? */
+		{
+			pstate->p_windowdefs = lappend(pstate->p_windowdefs, windef);
+			wfunc->winref = list_length(pstate->p_windowdefs);
+		}
+	}
+
+	pstate->p_hasWindowFuncs = true;
+}
 
 /*
  * parseCheckAggregates
@@ -207,6 +309,11 @@ parseCheckAggregates(ParseState *pstate, Query *qry)
 
 	/*
 	 * Check the targetlist and HAVING clause for ungrouped variables.
+	 *
+	 * Note: because we check resjunk tlist elements as well as regular ones,
+	 * this will also find ungrouped variables that came from ORDER BY and
+	 * WINDOW clauses.  For that matter, it's also going to examine the
+	 * grouping expressions themselves --- but they'll all pass the test ...
 	 */
 	clause = (Node *) qry->targetList;
 	if (hasJoinRTEs)
@@ -226,11 +333,94 @@ parseCheckAggregates(ParseState *pstate, Query *qry)
 	if (pstate->p_hasAggs && hasSelfRefRTEs)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_RECURSION),
-				 errmsg("aggregates not allowed in a recursive query's recursive term"),
+				 errmsg("aggregate functions not allowed in a recursive query's recursive term"),
 				 parser_errposition(pstate,
 									locate_agg_of_level((Node *) qry, 0))));
 }
 
+/*
+ * parseCheckWindowFuncs
+ *	Check for window functions where they shouldn't be.
+ *
+ *	We have to forbid window functions in WHERE, JOIN/ON, HAVING, GROUP BY,
+ *	and window specifications.  (Other clauses, such as RETURNING and LIMIT,
+ *	have already been checked.)  Transformation of all these clauses must
+ *	be completed already.  Violations raise ERRCODE_WINDOWING_ERROR.
+ */
+void
+parseCheckWindowFuncs(ParseState *pstate, Query *qry)
+{
+	ListCell	   *l;
+
+	/* This should only be called if we found window functions */
+	Assert(pstate->p_hasWindowFuncs);
+
+	if (checkExprHasWindowFuncs(qry->jointree->quals))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("window functions not allowed in WHERE clause"),
+				 parser_errposition(pstate,
+									locate_windowfunc(qry->jointree->quals))));
+	if (checkExprHasWindowFuncs((Node *) qry->jointree->fromlist))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("window functions not allowed in JOIN conditions"),
+				 parser_errposition(pstate,
+									locate_windowfunc((Node *) qry->jointree->fromlist))));
+	if (checkExprHasWindowFuncs(qry->havingQual))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("window functions not allowed in HAVING clause"),
+				 parser_errposition(pstate,
+									locate_windowfunc(qry->havingQual))));
+
+	foreach(l, qry->groupClause)
+	{
+		SortGroupClause *grpcl = (SortGroupClause *) lfirst(l);
+		Node	   *expr;
+
+		expr = get_sortgroupclause_expr(grpcl, qry->targetList);
+		if (checkExprHasWindowFuncs(expr))
+			ereport(ERROR,
+					(errcode(ERRCODE_WINDOWING_ERROR),
+					 errmsg("window functions not allowed in GROUP BY clause"),
+					 parser_errposition(pstate,
+										locate_windowfunc(expr))));
+	}
+
+	foreach(l, qry->windowClause)
+	{
+		WindowClause   *wc = (WindowClause *) lfirst(l);
+		ListCell   *l2;
+
+		foreach(l2, wc->partitionClause)	/* PARTITION BY items */
+		{
+			SortGroupClause *grpcl = (SortGroupClause *) lfirst(l2);
+			Node	   *expr;
+
+			expr = get_sortgroupclause_expr(grpcl, qry->targetList);
+			if (checkExprHasWindowFuncs(expr))
+				ereport(ERROR,
+						(errcode(ERRCODE_WINDOWING_ERROR),
+						 errmsg("window functions not allowed in window definition"),
+						 parser_errposition(pstate,
+											locate_windowfunc(expr))));
+		}
+		foreach(l2, wc->orderClause)		/* ORDER BY items */
+		{
+			SortGroupClause *grpcl = (SortGroupClause *) lfirst(l2);
+			Node	   *expr;
+
+			expr = get_sortgroupclause_expr(grpcl, qry->targetList);
+			if (checkExprHasWindowFuncs(expr))
+				ereport(ERROR,
+						(errcode(ERRCODE_WINDOWING_ERROR),
+						 errmsg("window functions not allowed in window definition"),
+						 parser_errposition(pstate,
+											locate_windowfunc(expr))));
+		}
+	}
+}
 
 /*
  * check_ungrouped_columns -
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 0e5fbfd28ac..df30361f0a5 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.181 2008/10/06 02:12:56 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.182 2008/12/28 18:53:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -40,8 +40,14 @@
 #define ORDER_CLAUSE 0
 #define GROUP_CLAUSE 1
 #define DISTINCT_ON_CLAUSE 2
+#define PARTITION_CLAUSE 3
 
-static char *clauseText[] = {"ORDER BY", "GROUP BY", "DISTINCT ON"};
+static const char * const clauseText[] = {
+	"ORDER BY",
+	"GROUP BY",
+	"DISTINCT ON",
+	"PARTITION BY"
+};
 
 static void extractRemainingColumns(List *common_colnames,
 						List *src_colnames, List *src_colvars,
@@ -76,6 +82,7 @@ static List *addTargetToSortList(ParseState *pstate, TargetEntry *tle,
 static List *addTargetToGroupList(ParseState *pstate, TargetEntry *tle,
 					 List *grouplist, List *targetlist, int location,
 					 bool resolveUnknown);
+static WindowClause *findWindowClause(List *wclist, const char *name);
 
 
 /*
@@ -555,15 +562,20 @@ transformRangeFunction(ParseState *pstate, RangeFunction *r)
 	 * Disallow aggregate functions in the expression.	(No reason to postpone
 	 * this check until parseCheckAggregates.)
 	 */
-	if (pstate->p_hasAggs)
-	{
-		if (checkExprHasAggs(funcexpr))
-			ereport(ERROR,
-					(errcode(ERRCODE_GROUPING_ERROR),
-					 errmsg("cannot use aggregate function in function expression in FROM"),
-					 parser_errposition(pstate,
-										locate_agg_of_level(funcexpr, 0))));
-	}
+	if (pstate->p_hasAggs &&
+		checkExprHasAggs(funcexpr))
+		ereport(ERROR,
+				(errcode(ERRCODE_GROUPING_ERROR),
+				 errmsg("cannot use aggregate function in function expression in FROM"),
+				 parser_errposition(pstate,
+									locate_agg_of_level(funcexpr, 0))));
+	if (pstate->p_hasWindowFuncs &&
+		checkExprHasWindowFuncs(funcexpr))
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+				 errmsg("cannot use window function in function expression in FROM"),
+				 parser_errposition(pstate,
+									locate_windowfunc(funcexpr))));
 
 	/*
 	 * OK, build an RTE for the function.
@@ -1156,16 +1168,28 @@ transformLimitClause(ParseState *pstate, Node *clause,
 				 parser_errposition(pstate,
 									locate_var_of_level(qual, 0))));
 	}
-	if (checkExprHasAggs(qual))
+	if (pstate->p_hasAggs &&
+		checkExprHasAggs(qual))
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_GROUPING_ERROR),
 		/* translator: %s is name of a SQL construct, eg LIMIT */
-				 errmsg("argument of %s must not contain aggregates",
+				 errmsg("argument of %s must not contain aggregate functions",
 						constructName),
 				 parser_errposition(pstate,
 									locate_agg_of_level(qual, 0))));
 	}
+	if (pstate->p_hasWindowFuncs &&
+		checkExprHasWindowFuncs(qual))
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_WINDOWING_ERROR),
+		/* translator: %s is name of a SQL construct, eg LIMIT */
+				 errmsg("argument of %s must not contain window functions",
+						constructName),
+				 parser_errposition(pstate,
+									locate_windowfunc(qual))));
+	}
 
 	return qual;
 }
@@ -1234,7 +1258,7 @@ findTargetlistEntry(ParseState *pstate, Node *node, List **tlist, int clause)
 		char	   *name = strVal(linitial(((ColumnRef *) node)->fields));
 		int			location = ((ColumnRef *) node)->location;
 
-		if (clause == GROUP_CLAUSE)
+		if (clause == GROUP_CLAUSE || clause == PARTITION_CLAUSE)
 		{
 			/*
 			 * In GROUP BY, we must prefer a match against a FROM-clause
@@ -1251,6 +1275,8 @@ findTargetlistEntry(ParseState *pstate, Node *node, List **tlist, int clause)
 			 * SQL99 do not allow GROUPing BY an outer reference, so this
 			 * breaks no cases that are legal per spec, and it seems a more
 			 * self-consistent behavior.
+			 *
+			 * Window PARTITION BY clauses should act exactly like GROUP BY.
 			 */
 			if (colNameToVar(pstate, name, true, location) != NULL)
 				name = NULL;
@@ -1356,12 +1382,17 @@ findTargetlistEntry(ParseState *pstate, Node *node, List **tlist, int clause)
  *
  * GROUP BY items will be added to the targetlist (as resjunk columns)
  * if not already present, so the targetlist must be passed by reference.
+ *
+ * This is also used for window PARTITION BY clauses (which actually act
+ * just the same, except for the clause name used in error messages).
  */
 List *
 transformGroupClause(ParseState *pstate, List *grouplist,
-					 List **targetlist, List *sortClause)
+					 List **targetlist, List *sortClause,
+					 bool isPartition)
 {
 	List	   *result = NIL;
+	int			clause = isPartition ? PARTITION_CLAUSE : GROUP_CLAUSE;
 	ListCell   *gl;
 
 	foreach(gl, grouplist)
@@ -1370,8 +1401,7 @@ transformGroupClause(ParseState *pstate, List *grouplist,
 		TargetEntry *tle;
 		bool		found = false;
 
-		tle = findTargetlistEntry(pstate, gexpr,
-								  targetlist, GROUP_CLAUSE);
+		tle = findTargetlistEntry(pstate, gexpr, targetlist, clause);
 
 		/* Eliminate duplicates (GROUP BY x, x) */
 		if (targetIsInSortList(tle, InvalidOid, result))
@@ -1452,6 +1482,125 @@ transformSortClause(ParseState *pstate,
 }
 
 /*
+ * transformWindowDefinitions -
+ *		transform window definitions (WindowDef to WindowClause)
+ */
+List *
+transformWindowDefinitions(ParseState *pstate,
+						   List *windowdefs,
+						   List **targetlist)
+{
+	List	   *result = NIL;
+	Index		winref = 0;
+	ListCell   *lc;
+
+	foreach(lc, windowdefs)
+	{
+		WindowDef	 *windef = (WindowDef *) lfirst(lc);
+		WindowClause *refwc = NULL;
+		List		 *partitionClause;
+		List		 *orderClause;
+		WindowClause *wc;
+
+		winref++;
+
+		/*
+		 * Check for duplicate window names.
+		 */
+		if (windef->name &&
+			findWindowClause(result, windef->name) != NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_WINDOWING_ERROR),
+					 errmsg("window \"%s\" is already defined", windef->name),
+					 parser_errposition(pstate, windef->location)));
+
+		/*
+		 * If it references a previous window, look that up.
+		 */
+		if (windef->refname)
+		{
+			refwc = findWindowClause(result, windef->refname);
+			if (refwc == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_UNDEFINED_OBJECT),
+						 errmsg("window \"%s\" does not exist",
+								windef->refname),
+						 parser_errposition(pstate, windef->location)));
+		}
+
+		/*
+		 * Transform PARTITION and ORDER specs, if any.  These are treated
+		 * exactly like top-level GROUP BY and ORDER BY clauses, including
+		 * the special handling of nondefault operator semantics.
+		 */
+		orderClause = transformSortClause(pstate,
+										  windef->orderClause,
+										  targetlist,
author	Tom Lane	2008-12-28 18:54:01 +0000
committer	Tom Lane	2008-12-28 18:54:01 +0000
commit	95b07bc7f5010233f52f9d11da74e2e5b653b0a7 (patch)
tree	48f5858bf4eca1bfb316ef02bb959ca85f568e0a
parent	38e9348282e9d078487147ba8a85aebec54e3a08 (diff)