Commit d3c8d26

gburd authored and Commitfest Bot committed
Track buffer usage per backend and use that to inform buffer management
Implement a comprehensive buffer pressure monitoring and management system to improve PostgreSQL's buffer replacement efficiency under high load.

Problems:
- Buffer pressure was only detectable reactively, during allocation failures
- No visibility into which backends contribute most to buffer contention
- High usage counts force multiple clock-sweep passes under heavy load
- bgwriter had limited intelligence about system-wide buffer usage patterns

Solution:
- Add per-backend buffer usage counters
- Implement proactive buffer pressure calculation in bgwriter
- Add targeted buffer writing for high-usage backends (90th percentile)
- Adjust the rate of usage-count reduction when pressure exceeds the 75% threshold

Benefits:
- Reduces buffer allocation stalls by proactively managing pressure
- Provides fair resource management by targeting high-usage backends
- Improves system responsiveness under memory pressure
- Maintains backward compatibility with existing buffer management
- Enables better observability of per-backend buffer usage patterns
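For intuition before the diff: the pressure metric the patch computes is the sum of every backend's bufferUsageSum counter, clamped to and divided by NBuffers × BM_MAX_USAGE_COUNT. Below is a minimal standalone sketch of that arithmetic, not part of the commit; the 16384-buffer pool and the sample usage total are assumed values, while 5 is PostgreSQL's usual usage-count ceiling.

#include <stdio.h>
#include <stdint.h>

#define NBUFFERS           16384	/* assumed: 128MB of shared_buffers / 8kB pages */
#define BM_MAX_USAGE_COUNT 5		/* usage-count ceiling per buffer */

int
main(void)
{
	uint64_t	total_usage = 65000;	/* assumed sum of per-backend counters */
	uint64_t	max_possible = (uint64_t) NBUFFERS * BM_MAX_USAGE_COUNT;	/* 81920 */
	double		pressure_ratio;

	if (total_usage > max_possible)
		total_usage = max_possible;		/* clamp, as the bgwriter loop below does */
	pressure_ratio = (double) total_usage / max_possible;

	/* 65000 / 81920 ~= 0.79: past the 0.75 "high" threshold, short of 0.90 "critical" */
	printf("pressure = %.2f%% -> %s\n", pressure_ratio * 100,
		   pressure_ratio > 0.90 ? "critical" :
		   pressure_ratio > 0.75 ? "high" : "normal");
	return 0;
}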
1 parent e3bd1e5 commit d3c8d26

File tree

8 files changed: +420 additions, -4 deletions

src/backend/postmaster/bgwriter.c

Lines changed: 320 additions & 0 deletions
@@ -77,6 +77,283 @@ int BgWriterDelay = 200;
 static TimestampTz last_snapshot_ts;
 static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr;
 
+/*
+ * Collected buffer usage information.
+ */
+typedef struct BackendBufferStats
+{
+	int			backend_id;
+	uint64		usage_sum;
+	double		usage_ratio;
+} BackendBufferStats;
+
+static int
+compare_backend_usage(const void *a, const void *b)
+{
+	const BackendBufferStats *stat_a = (const BackendBufferStats *) a;
+	const BackendBufferStats *stat_b = (const BackendBufferStats *) b;
+
+	if (stat_a->usage_ratio < stat_b->usage_ratio)
+		return -1;
+	if (stat_a->usage_ratio > stat_b->usage_ratio)
+		return 1;
+	return 0;
+}
+
+static uint64
+CalculateSystemBufferPressure(BackendBufferStats *backend_stats[], int *num_backends)
+{
+	uint64		total_usage = 0;
+	int			active_backends = 0;
+	BackendBufferStats *stats;
+
+	/* Count active backends first */
+	for (int i = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+
+		if (proc->pid != 0 && proc->databaseId != InvalidOid)
+			active_backends++;
+	}
+
+	if (active_backends == 0)
+	{
+		*backend_stats = NULL;
+		*num_backends = 0;
+		return 0;
+	}
+
+	/* Allocate stats array */
+	stats = palloc(sizeof(BackendBufferStats) * active_backends);
+	*backend_stats = stats;
+	*num_backends = active_backends;
+
+	/* Collect stats from all active backends */
+	for (int i = 0, j = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+
+		if (proc->pid != 0 && proc->databaseId != InvalidOid)
+		{
+			uint64		usage_sum = pg_atomic_read_u32(&proc->bufferUsageSum);
+
+			stats[j].backend_id = i;
+			stats[j].usage_sum = usage_sum;
+			stats[j].usage_ratio = (double) usage_sum / NBuffers;
+			total_usage += usage_sum;
+			j++;
+		}
+	}
+
+	/* Sort by usage ratio for percentile calculation */
+	qsort(stats, active_backends, sizeof(BackendBufferStats),
+		  compare_backend_usage);
+
+	return total_usage;
+}
+
+static void
+GetHighUsageBackends(BackendBufferStats *stats, int num_backends,
+					 int **high_usage_backends, int *num_high_usage)
+{
+	int			percentile_90_idx = (int) (num_backends * 0.9);
+
+	*num_high_usage = num_backends - percentile_90_idx;
+
+	if (*num_high_usage > 0)
+	{
+		*high_usage_backends = palloc(sizeof(int) * (*num_high_usage));
+		for (int i = 0; i < *num_high_usage; i++)
+			(*high_usage_backends)[i] = stats[percentile_90_idx + i].backend_id;
+	}
+	else
+	{
+		*high_usage_backends = NULL;
+		*num_high_usage = 0;
+	}
+}
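A quick check of the cutoff arithmetic (illustrative numbers, not from the patch): with 20 active backends sorted ascending by usage_ratio, percentile_90_idx = (int) (20 * 0.9) = 18, so the two busiest backends form the high-usage set. And since num_backends - percentile_90_idx is at least 1 whenever any backend is active, the set is never empty: with a single active backend, that backend is always classified as high-usage.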
+
+/*
+ * Shared buffer sync function used by both main loop and aggressive writing
+ */
+static int
+SyncTargetedBuffers(WritebackContext *wb_context, int *target_backends,
+					int num_targets, int max_buffers)
+{
+	int			buffers_written = 0;
+	int			buffer_id;
+	BufferDesc *bufHdr;
+	uint32		buf_state;
+
+	/* If no specific targets, sync any dirty buffers */
+	if (target_backends == NULL || num_targets == 0)
+		return BgBufferSync(wb_context);
+
+	/* Scan through all buffers looking for dirty ones from target backends */
+	for (buffer_id = 0; buffer_id < NBuffers && buffers_written < max_buffers; buffer_id++)
+	{
+		uint32		dirty_backend;
+		bool		is_target;
+
+		bufHdr = GetBufferDescriptor(buffer_id);
+
+		/* Quick check if buffer is dirty */
+		buf_state = pg_atomic_read_u32(&bufHdr->state);
+		if (!(buf_state & BM_DIRTY))
+			continue;
+
+		/* Check if this buffer is from one of our target backends */
+		dirty_backend = pg_atomic_read_u32(&bufHdr->dirty_backend_id);
+		is_target = false;
+
+		for (int i = 0; i < num_targets; i++)
+			if (dirty_backend == target_backends[i])
+			{
+				is_target = true;
+				break;
+			}
+
+		if (!is_target)
+			continue;
+
+		/* Skip if buffer is pinned */
+		if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
+			continue;
+
+		/* Try to write this buffer using the writeback context */
+		ScheduleBufferTagForWriteback(wb_context,
+									  IOContextForStrategy(NULL),
+									  &bufHdr->tag);
+		buffers_written++;
+	}
+
+	/* Issue the actual writes */
+	if (buffers_written > 0)
+		IssuePendingWritebacks(wb_context, IOContextForStrategy(NULL));
+
+	return buffers_written;
+}
+
+static void
+AggressiveBufferWrite(WritebackContext *wb_context, int *high_usage_backends,
+					  int num_high_usage, bool critical)
+{
+	int			write_target = critical ? bgwriter_lru_maxpages * 3 : bgwriter_lru_maxpages * 2;
+	int			buffers_written = 0;
+
+	/* Focus on buffers from high-usage backends first */
+	buffers_written = SyncTargetedBuffers(wb_context, high_usage_backends,
+										  num_high_usage, write_target);
+
+	/* If still under target, write additional dirty buffers */
+	if (buffers_written < write_target)
+		BgBufferSync(wb_context);
+}
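With bgwriter_lru_maxpages at its default of 100, AggressiveBufferWrite() therefore targets 300 buffers per cycle under critical pressure and 200 under high pressure, falling back to an ordinary BgBufferSync() pass whenever the targeted scan comes up short of that target.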
+
+/* In src/backend/postmaster/bgwriter.c - Enhanced UpdateBackendDecayRates */
+static void
+UpdateBackendDecayRates(BackendBufferStats *backend_stats, int num_backends,
+						double pressure_ratio, int *high_usage_backends, int num_high_usage)
+{
+	uint32		base_decay_rate;
+	uint64		total_usage = 0;
+	uint64		avg_usage;
+	int			i,
+				j;
+
+	/* Calculate base decay rate from system pressure */
+	if (pressure_ratio > 0.90)
+		/* Critical pressure - aggressive decay */
+		base_decay_rate = 3;
+	else if (pressure_ratio > 0.75)
+		/* High pressure */
+		base_decay_rate = 2;
+	else
+		/* Normal decay rate */
+		base_decay_rate = 1;
+
+	/* Calculate total usage for relative comparisons */
+	for (i = 0; i < num_backends; i++)
+		total_usage += backend_stats[i].usage_sum;
+	avg_usage = num_backends > 0 ? total_usage / num_backends : 0;
+
+	if (base_decay_rate > 1)
+		elog(DEBUG2, "Buffer pressure: %.2f%%, base decay rate: %u, avg usage: %lu",
+			 pressure_ratio * 100, base_decay_rate, avg_usage);
+
+	/* Update each backend's personalized decay rate */
+	for (i = 0; i < ProcGlobal->allProcCount; i++)
+	{
+		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+
+		/* Only update active user backends */
+		if (proc->pid != 0 && proc->databaseId != InvalidOid)
+		{
+			uint32		backend_usage = pg_atomic_read_u32(&proc->bufferUsageSum);
+			uint32		personalized_rate = base_decay_rate;
+
+			/* Find this backend in the stats array */
+			BackendBufferStats *backend_stat = NULL;
+
+			for (j = 0; j < num_backends; j++)
+			{
+				if (backend_stats[j].backend_id == i)
+				{
+					backend_stat = &backend_stats[j];
+					break;
+				}
+			}
+
+			/*
+			 * Calculate personalized decay rate based on usage and
+			 * clock-sweep performance.
+			 */
+			if (backend_stat != NULL && avg_usage > 0)
+			{
+				double		usage_ratio = (double) backend_usage / avg_usage;
+
+				/* Get clock-sweep performance metrics */
+				uint32		search_count = pg_atomic_read_u32(&proc->bufferSearchCount);
+				uint64		total_distance = pg_atomic_read_u64(&proc->clockSweepDistance);
+				uint32		total_passes = pg_atomic_read_u32(&proc->clockSweepPasses);
+				uint64		total_time = pg_atomic_read_u64(&proc->clockSweepTimeMicros);
+
+				/* Calculate average search metrics */
+				double		avg_distance = search_count > 0 ? (double) total_distance / search_count : 0;
+				double		avg_passes = search_count > 0 ? (double) total_passes / search_count : 0;
+				double		avg_time = search_count > 0 ? (double) total_time / search_count : 0;
+
+				/* Adjust decay rate based on usage relative to average */
+				if (usage_ratio > 2.0)
+					/* High usage backends get more aggressive decay */
+					personalized_rate = Min(4, base_decay_rate + 2);
+				else if (usage_ratio > 1.5)
+					personalized_rate = Min(4, base_decay_rate + 1);
+				else if (usage_ratio < 0.5)
+					/* Low usage backends get less aggressive decay */
+					personalized_rate = Max(1, base_decay_rate > 1 ? base_decay_rate - 1 : 1);
+
+				/* Further adjust based on clock-sweep performance */
+				if (avg_distance > NBuffers * 0.5)
+					/* Searching more than half the buffer pool */
+					personalized_rate = Min(4, personalized_rate + 1);
+				if (avg_passes > 1.0)
+					/* Making multiple complete passes */
+					personalized_rate = Min(4, personalized_rate + 1);
+				if (avg_time > 1000.0)
+					/* Taking more than 1ms per search */
+					personalized_rate = Min(4, personalized_rate + 1);
+
+				elog(DEBUG2, "Backend %d: usage_ratio=%.2f, avg_distance=%.1f, avg_passes=%.2f, "
+					 "avg_time=%.1fμs, decay_rate=%u",
+					 i, usage_ratio, avg_distance, avg_passes, avg_time, personalized_rate);
+			}
+
+			/* Update the backend's decay rate */
+			pg_atomic_write_u32(&proc->bufferDecayRate, personalized_rate);
+		}
+	}
+}
 
 /*
  * Main entry point for bgwriter process
@@ -222,6 +499,15 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 	 */
 	for (;;)
 	{
+		BackendBufferStats *backend_stats = NULL;
+		int			num_backends;
+		int		   *high_usage_backends = NULL;
+		int			num_high_usage;
+		uint64		max_possible;
+		uint64		total_usage;
+		double		pressure_ratio;
+		bool		high_pressure;
+		bool		critical_pressure;
 		bool		can_hibernate;
 		int			rc;

@@ -230,6 +516,35 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 
 		ProcessMainLoopInterrupts();
 
+		/* Calculate current buffer pressure */
+		total_usage = CalculateSystemBufferPressure(&backend_stats, &num_backends);
+		max_possible = (uint64) NBuffers * BM_MAX_USAGE_COUNT;
+		total_usage = total_usage > max_possible ? max_possible : total_usage;
+		pressure_ratio = (double) total_usage / max_possible;
+
+		/* Get high-usage backends (90th percentile) */
+		if (backend_stats != NULL)
+			GetHighUsageBackends(backend_stats, num_backends,
+								 &high_usage_backends, &num_high_usage);
+
+		/* Update global decay rate based on current pressure */
+		UpdateBackendDecayRates(backend_stats, num_backends, pressure_ratio,
+								high_usage_backends, num_high_usage);
+
+		/* Determine if proactive action is needed */
+		high_pressure = pressure_ratio > 0.75;	/* 75% threshold */
+		critical_pressure = pressure_ratio > 0.90;	/* 90% threshold */
+
+		if (high_pressure)
+		{
+			elog(LOG, "%s buffer pressure detected: %.2f%% (%d high-usage backends)",
+				 critical_pressure ? "Critical" : "High",
+				 pressure_ratio * 100, num_high_usage);
+
+			/* Aggressive writing of dirty buffers */
+			AggressiveBufferWrite(&wb_context, high_usage_backends, num_high_usage, critical_pressure);
+		}
+
 		/*
 		 * Do one cycle of dirty-buffer writing.
 		 */
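To make the thresholds above concrete (illustrative numbers, not from the patch): at pressure_ratio = 0.80 the loop logs high pressure and UpdateBackendDecayRates() picks a base rate of 2; a backend whose bufferUsageSum sits at 2.5x the average then gets Min(4, 2 + 2) = 4, while one at 40% of the average drops to Max(1, 2 - 1) = 1 and climbs back to Min(4, 1 + 1) = 2 only if its clock sweeps average more than one full pass. Rates stay within [1, 4] throughout.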
@@ -294,6 +609,11 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
 			}
 		}
 
+		if (backend_stats != NULL)
+			pfree(backend_stats);
+		if (high_usage_backends != NULL)
+			pfree(high_usage_backends);
+
 		/*
 		 * Sleep until we are signaled or BgWriterDelay has elapsed.
 		 *

src/backend/storage/buffer/buf_init.c

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,9 @@ BufferManagerShmemInit(void)
 		pg_atomic_init_u32(&buf->state, 0);
 		buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
 
+		/* Initialize dirty backend tracking */
+		pg_atomic_init_u32(&buf->dirty_backend_id, INVALID_PROC_NUMBER);
+
 		buf->buf_id = i;
 
 		pgaio_wref_clear(&buf->io_wref);

src/backend/storage/buffer/bufmgr.c

Lines changed: 9 additions & 0 deletions
@@ -2136,6 +2138,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * just like permanent relations.
 	 */
 	victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+	if (MyProc != NULL)
+		pg_atomic_add_fetch_u32(&MyProc->bufferUsageSum, 1);
 	if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
 		victim_buf_state |= BM_PERMANENT;

@@ -2781,6 +2783,8 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 	victim_buf_hdr->tag = tag;
 
 	buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+	if (MyProc != NULL)
+		pg_atomic_add_fetch_u32(&MyProc->bufferUsageSum, 1);
 	if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
 		buf_state |= BM_PERMANENT;

@@ -2950,6 +2954,11 @@ MarkBufferDirty(Buffer buffer)
 	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
 	buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
 
+	/* Track which backend dirtied this buffer */
+	if (MyProc != NULL)
+		pg_atomic_write_u32(&bufHdr->dirty_backend_id,
+							MyProc - ProcGlobal->allProcs);
+
 	if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
 									   buf_state))
 		break;
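A closing note on the MarkBufferDirty() hunk: the value stored in dirty_backend_id is the backend's slot index in ProcGlobal->allProcs, which is the same index CalculateSystemBufferPressure() records as backend_id in bgwriter.c, so SyncTargetedBuffers() compares like with like. A hypothetical helper (not part of the patch) spelling out that identity:

static inline uint32
MyProcIndex(void)
{
	/*
	 * Pointer arithmetic over the PGPROC array yields this backend's slot
	 * number, the same value the bgwriter's stats pass uses as backend_id.
	 */
	return (uint32) (MyProc - ProcGlobal->allProcs);
}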
