-
Notifications
You must be signed in to change notification settings - Fork 159
/
Copy pathgce.go
511 lines (470 loc) · 14.8 KB
/
gce.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package buildlet
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"os/exec"
"regexp"
"sort"
"strings"
"sync"
"time"
"golang.org/x/build/buildenv"
"golang.org/x/build/dashboard"
"golang.org/x/oauth2"
"golang.org/x/oauth2/google"
"google.golang.org/api/compute/v1"
)
// GCEGate, when non-nil, is invoked ahead of every GCE API call made by
// this package. Callers can install a function here to throttle the
// request rate (QPS) sent to GCE.
var GCEGate func()

// apiGate runs the GCEGate hook if one has been installed; with no hook
// set it does nothing.
func apiGate() {
	if GCEGate == nil {
		return
	}
	GCEGate()
}
// ErrQuotaExceeded matches errors.Is when VM creation fails with a
// quota error. Currently, it only supports GCE quota errors.
var ErrQuotaExceeded = errors.New("quota exceeded")

// GCEError is the error returned by StartNewVM when the GCE
// instance-create operation completes but reports errors. It implements
// error and supports errors.Is(err, ErrQuotaExceeded).
type GCEError struct {
	// OpErrors holds the individual error entries copied from the
	// finished operation's error list.
	OpErrors []*compute.OperationErrorErrors
}
// Error renders the operation errors as a single line: a count header
// followed by each entry JSON-encoded and separated by "; ". An entry
// that cannot be marshaled is replaced by a note describing the
// marshaling failure.
func (q *GCEError) Error() string {
	var out bytes.Buffer
	fmt.Fprintf(&out, "%d GCE operation errors: ", len(q.OpErrors))
	sep := "" // empty before the first entry, "; " afterwards
	for idx, opErr := range q.OpErrors {
		out.WriteString(sep)
		sep = "; "
		if encoded, err := json.Marshal(opErr); err != nil {
			fmt.Fprintf(&out, "json.Marshal(OpErrors[%d]): %v", idx, err)
		} else {
			out.Write(encoded)
		}
	}
	return out.String()
}
// Is reports whether q should be treated as target by errors.Is. The only
// supported target is ErrQuotaExceeded, which matches when at least one
// operation error carries the code "QUOTA_EXCEEDED".
func (q *GCEError) Is(target error) bool {
	if target != ErrQuotaExceeded {
		return false
	}
	for _, opErr := range q.OpErrors {
		if opErr.Code == "QUOTA_EXCEEDED" {
			return true
		}
	}
	return false
}
// StartNewVM boots a new VM on GCE and returns a buildlet client
// configured to speak to it.
//
// creds supplies the OAuth2 token source used for GCE API calls, buildEnv
// provides defaults (project name, random zone, COS service account),
// instName is the name of the instance (and its boot disk) to create, and
// hostType must be a key of dashboard.Hosts describing a VM or container
// host.
//
// Zero-valued fields of opts are defaulted: Description is derived from
// hostType, ProjectID from buildEnv.ProjectName, Zone from
// buildEnv.RandomVMZone, and DeleteIn to 30 minutes. A negative DeleteIn
// suppresses the "delete-at" metadata entry.
//
// The call blocks, polling every two seconds, until the create operation
// finishes. If the operation reports errors a *GCEError is returned, which
// matches ErrQuotaExceeded via errors.Is for quota failures. When
// opts.UseIAPTunnel is set, the returned Client also shuts down the tunnel
// when closed.
func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instName, hostType string, opts VMOpts) (Client, error) {
	ctx := context.TODO()
	computeService, err := compute.New(oauth2.NewClient(ctx, creds.TokenSource))
	if err != nil {
		return nil, fmt.Errorf("creating GCE compute service: %w", err)
	}

	if opts.Description == "" {
		opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
	}
	if opts.ProjectID == "" {
		opts.ProjectID = buildEnv.ProjectName
	}
	if opts.Zone == "" {
		opts.Zone = buildEnv.RandomVMZone()
	}
	zone := opts.Zone
	if opts.DeleteIn == 0 {
		opts.DeleteIn = 30 * time.Minute
	}

	hconf, ok := dashboard.Hosts[hostType]
	if !ok {
		return nil, fmt.Errorf("invalid host type %q", hostType)
	}
	if !hconf.IsVM() && !hconf.IsContainer() {
		return nil, fmt.Errorf("host %q is type %q; want either a VM or container type", hostType, hconf.PoolName())
	}

	projectID := opts.ProjectID
	if projectID == "" {
		return nil, errors.New("buildlet: missing required ProjectID option")
	}

	// All resource references below are fully-qualified Compute Engine
	// API URLs rooted at the project.
	prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID
	machType := prefix + "/zones/" + zone + "/machineTypes/" + hconf.MachineType()
	diskType := prefix + "/zones/" + zone + "/diskTypes/pd-ssd"
	if hconf.RegularDisk {
		diskType = "" // a spinning disk
	}

	srcImage := prefix + "/global/images/" + hconf.VMImage
	minCPU := hconf.MinCPUPlatform
	if hconf.IsContainer() {
		if hconf.NestedVirt {
			minCPU = "Intel Cascade Lake" // n2 vms (which support NestedVirtualization) are either Ice Lake or Cascade Lake.
		}
		if vm := hconf.ContainerVMImage(); vm != "" {
			srcImage = prefix + "/global/images/" + vm
		} else {
			srcImage, err = cosImage(ctx, computeService, hconf.CosArchitecture())
			if err != nil {
				return nil, fmt.Errorf("error find Container-Optimized OS image: %w", err)
			}
		}
	}

	instance := &compute.Instance{
		Name:           instName,
		Description:    opts.Description,
		MachineType:    machType,
		MinCpuPlatform: minCPU,
		Disks: []*compute.AttachedDisk{
			{
				AutoDelete: true,
				Boot:       true,
				Type:       "PERSISTENT",
				InitializeParams: &compute.AttachedDiskInitializeParams{
					DiskName:    instName,
					SourceImage: srcImage,
					DiskType:    diskType,
					DiskSizeGb:  opts.DiskSizeGB,
				},
			},
		},
		Tags: &compute.Tags{
			// Warning: do NOT list "http-server" or "allow-ssh" (our
			// project's custom tag to allow ssh access) here; the
			// buildlet provides full remote code execution.
			// The https-server is authenticated, though.
			Items: []string{"https-server"},
		},
		Metadata: &compute.Metadata{},
		NetworkInterfaces: []*compute.NetworkInterface{{
			Network: prefix + "/global/networks/default-vpc",
		}},

		// Prior to git rev 1b1e086fd, we used preemptible
		// instances, as we were helping test the feature. It was
		// removed after git rev a23395d because we hadn't been
		// using it for some time. Our VMs are so short-lived that
		// the feature doesn't really help anyway. But if we ever
		// find we want it again, this comment is here to point to
		// code that might be useful to partially resurrect.
		Scheduling: &compute.Scheduling{Preemptible: false},
	}

	// Container builders use the COS image, which defaults to logging to Cloud Logging.
	// Permission is granted to this service account.
	if hconf.IsContainer() && buildEnv.COSServiceAccount != "" {
		instance.ServiceAccounts = []*compute.ServiceAccount{
			{
				Email:  buildEnv.COSServiceAccount,
				Scopes: []string{compute.CloudPlatformScope},
			},
		}
	}

	// addMeta appends one key/value pair to the instance metadata.
	addMeta := func(key, value string) {
		instance.Metadata.Items = append(instance.Metadata.Items, &compute.MetadataItems{
			Key:   key,
			Value: &value,
		})
	}
	// The buildlet-binary-url is the URL of the buildlet binary
	// which the VMs are configured to download at boot and run.
	// This lets us update the buildlet more easily than
	// rebuilding the whole VM image.
	addMeta("buildlet-binary-url", hconf.BuildletBinaryURL(buildenv.ByProjectID(opts.ProjectID)))
	addMeta("buildlet-host-type", hostType)
	if !opts.TLS.IsZero() {
		addMeta("tls-cert", opts.TLS.CertPEM)
		addMeta("tls-key", opts.TLS.KeyPEM)
		addMeta("password", opts.TLS.Password())
	}

	// The metadata payloads below are YAML, so the indentation inside the
	// raw string literals is significant and must be preserved.
	if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchAMD64 {
		addMeta("gce-container-declaration", fmt.Sprintf(`spec:
  containers:
    - name: buildlet
      image: 'gcr.io/%s/%s'
      volumeMounts:
        - name: tmpfs-0
          mountPath: /workdir
      securityContext:
        privileged: true
      stdin: false
      tty: false
  restartPolicy: Always
  volumes:
    - name: tmpfs-0
      emptyDir:
        medium: Memory
`, opts.ProjectID, hconf.ContainerImage))
		addMeta("user-data", `#cloud-config
runcmd:
- sysctl -w kernel.core_pattern=core
`)
	} else if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchARM64 {
		addMeta("user-data", fmt.Sprintf(`#cloud-config
write_files:
- path: /etc/systemd/system/buildlet.service
  permissions: 0644
  owner: root:root
  content: |
    [Unit]
    Description=Start buildlet container
    Wants=gcr-online.target
    After=gcr-online.target
    [Service]
    Environment="HOME=/home/buildlet"
    ExecStart=/usr/bin/docker run --rm --name=buildlet --privileged -p 80:80 gcr.io/%s/%s
    ExecStop=/usr/bin/docker stop buildlet
    ExecStopPost=/usr/bin/docker rm buildlet
    RemainAfterExit=true
    Type=oneshot
runcmd:
- systemctl daemon-reload
- systemctl start buildlet.service
- sysctl -w kernel.core_pattern=core
`, opts.ProjectID, hconf.ContainerImage))
	}

	if opts.DeleteIn > 0 {
		// In case the VM gets away from us (generally: if the
		// coordinator dies while a build is running), then we
		// set this attribute of when it should be killed so
		// we can kill it later when the coordinator is
		// restarted. The cleanUpOldVMs goroutine loop handles
		// that killing.
		addMeta("delete-at", fmt.Sprint(time.Now().Add(opts.DeleteIn).Unix()))
	}

	for k, v := range opts.Meta {
		addMeta(k, v)
	}

	apiGate()
	op, err := computeService.Instances.Insert(projectID, zone, instance).Do()
	if err != nil {
		return nil, fmt.Errorf("Failed to create instance: %w", err)
	}
	condRun(opts.OnInstanceRequested)
	createOp := op.Name

	// Wait for instance create operation to succeed.
OpLoop:
	for {
		time.Sleep(2 * time.Second)
		apiGate()
		op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do()
		if err != nil {
			return nil, fmt.Errorf("failed to get op %s: %w", createOp, err)
		}
		switch op.Status {
		case "PENDING", "RUNNING":
			continue
		case "DONE":
			if op.Error != nil {
				// Copy the entries so the returned error does not
				// alias the operation's own slice.
				gceErr := &GCEError{OpErrors: make([]*compute.OperationErrorErrors, len(op.Error.Errors))}
				copy(gceErr.OpErrors, op.Error.Errors)
				return nil, gceErr
			}
			break OpLoop
		default:
			return nil, fmt.Errorf("unknown create status %q: %+v", op.Status, op)
		}
	}
	condRun(opts.OnInstanceCreated)

	apiGate()
	inst, err := computeService.Instances.Get(projectID, zone, instName).Do()
	if err != nil {
		return nil, fmt.Errorf("Error getting instance %s details after creation: %w", instName, err)
	}

	// Finds its internal and/or external IP addresses.
	intIP, extIP := instanceIPs(inst)

	// Wait for it to boot and its buildlet to come up.
	// With TLS configured the buildlet is reached over HTTPS on the
	// external IP; otherwise over plain HTTP on the internal IP.
	var buildletURL string
	var ipPort string
	if !opts.TLS.IsZero() {
		if extIP == "" {
			return nil, errors.New("didn't find its external IP address")
		}
		buildletURL = "https://" + extIP
		ipPort = extIP + ":443"
	} else {
		if intIP == "" {
			return nil, errors.New("didn't find its internal IP address")
		}
		buildletURL = "http://" + intIP
		ipPort = intIP + ":80"
	}
	if opts.OnGotInstanceInfo != nil {
		opts.OnGotInstanceInfo(inst)
	}

	// closeFunc, when non-nil, tears down the IAP tunnel process.
	var closeFunc func()
	if opts.UseIAPTunnel {
		var localPort string
		localPort, closeFunc, err = createIAPTunnel(ctx, inst)
		if err != nil {
			return nil, fmt.Errorf("creating IAP tunnel: %w", err)
		}
		// Talk to the buildlet through the local end of the tunnel.
		buildletURL = "http://localhost:" + localPort
		ipPort = "127.0.0.1:" + localPort
	}
	client, err := buildletClient(ctx, buildletURL, ipPort, &opts)
	if err != nil {
		if closeFunc != nil {
			// Nobody will ever Close a client we failed to create, so
			// shut the tunnel down here rather than leaking it.
			closeFunc()
		}
		return nil, err
	}
	if closeFunc != nil {
		return &extraCloseClient{client, closeFunc}, nil
	}
	return client, nil
}
// extraCloseClient wraps a Client together with an additional cleanup
// function that is run when the client is closed. StartNewVM returns one
// when an IAP tunnel was created, passing the tunnel's shutdown function
// as the cleanup hook.
type extraCloseClient struct {
	Client
	close func() // extra cleanup hook, invoked via defer from Close
}
// Close closes the wrapped Client and then runs the extra cleanup hook.
// It returns the wrapped Client's Close error.
//
// The embedded Client must be named explicitly: a bare e.Close() resolves
// to this method itself and would recurse without bound.
func (e *extraCloseClient) Close() error {
	defer e.close()
	return e.Client.Close()
}
// createIAPTunnel starts a "gcloud compute start-iap-tunnel" subprocess
// forwarding a free local TCP port to port 80 of inst.
//
// On success it returns the local port number (as a decimal string) and a
// function that shuts the tunnel process down. It waits up to 60 seconds,
// probing every 5 seconds, for the local port to accept connections. If
// gcloud exits first, or the wait times out, an error is returned and no
// tunnel process is left running.
func createIAPTunnel(ctx context.Context, inst *compute.Instance) (string, func(), error) {
	// Allocate a local listening port.
	// The listener is closed right away; it only serves to pick a free
	// port number that is then handed to gcloud.
	ln, err := net.Listen("tcp", "localhost:0")
	if err != nil {
		return "", nil, err
	}
	localAddr := ln.Addr().(*net.TCPAddr)
	ln.Close()

	// Start the gcloud command. For some reason, when gcloud is run with a
	// pipe for stdout, it doesn't log the success message, so we can only
	// check for success empirically.
	m := regexp.MustCompile(`/projects/([^/]+)/zones/([^/]+)`).FindStringSubmatch(inst.Zone)
	if m == nil {
		return "", nil, fmt.Errorf("unexpected inst.Zone: %q", inst.Zone)
	}
	project, zone := m[1], m[2]
	tunnelCmd := exec.CommandContext(ctx,
		"gcloud", "compute", "start-iap-tunnel", "--iap-tunnel-disable-connection-check",
		"--project", project, "--zone", zone, inst.Name, "80", "--local-host-port", localAddr.String())

	// hideWriter hides the underlying io.Writer from os/exec, bypassing the
	// special case where os/exec will let a subprocess share the fd to an
	// *os.File. Using hideWriter will result in goroutines that copy from a
	// fresh pipe and write to the writer in the parent Go program.
	// That guarantees that if the subprocess
	// leaves background processes lying around, they will not keep lingering
	// references to the parent Go program's stdout and stderr.
	//
	// Prior to this, it was common for ./debugnewvm | cat to never finish,
	// because debugnewvm left some gcloud helper processes behind, and cat
	// (or any other program) would never observe EOF on its input pipe.
	// We now try to shut gcloud down more carefully with os.Interrupt below,
	// but hideWriter guarantees that lingering processes won't hang
	// pipelines.
	type hideWriter struct{ io.Writer }
	tunnelCmd.Stderr = hideWriter{os.Stderr}
	tunnelCmd.Stdout = hideWriter{os.Stdout}
	if err := tunnelCmd.Start(); err != nil {
		return "", nil, err
	}

	kill := func() {
		// gcloud compute start-iap-tunnel is a group of Python processes,
		// so send an interrupt to try for an orderly shutdown of the process tree
		// before killing the process outright.
		tunnelCmd.Process.Signal(os.Interrupt)
		time.Sleep(2 * time.Second)
		tunnelCmd.Process.Kill()
	}

	// Start the process. Either it's going to fail to start after a bit, or
	// it'll start listening on its port. Because we told it not to check the
	// connection above, the connections won't be functional, but we can dial.
	errc := make(chan error, 1) // buffered so the Wait goroutine never blocks
	go func() { errc <- tunnelCmd.Wait() }()
	for start := time.Now(); time.Since(start) < 60*time.Second; time.Sleep(5 * time.Second) {
		// Check if the server crashed.
		select {
		case err := <-errc:
			if err == nil {
				// A clean exit still means there is no tunnel; never
				// report success with an empty port.
				err = errors.New("gcloud start-iap-tunnel exited before the tunnel was ready")
			}
			return "", nil, err
		default:
		}
		// Check if it's healthy.
		conn, err := net.DialTCP("tcp", nil, localAddr)
		if err == nil {
			conn.Close()
			return fmt.Sprint(localAddr.Port), kill, nil
		}
	}
	// Timed out: the caller gets no shutdown function, so stop the gcloud
	// process tree here instead of leaking it.
	kill()
	return "", nil, fmt.Errorf("iap tunnel startup timed out")
}
// VM describes a GCE VM instance running a buildlet.
type VM struct {
	// Name is the name of the GCE VM instance.
	// For example, it's of the form "mote-bradfitz-plan9-386-foo",
	// and not "plan9-386-foo".
	Name string
	// IPPort is presumably the "ip:port" address of the buildlet on this
	// VM — TODO confirm against the code that populates it.
	IPPort string
	// TLS is the key pair associated with this VM; NOTE(review): looks
	// like the buildlet's TLS credentials — confirm against callers.
	TLS  KeyPair
	Type string // buildlet type
}
// instanceIPs scans the instance's network interfaces and returns its
// internal and external IP addresses. The internal address is a NetworkIP
// beginning with "10."; the external address is the NatIP of an access
// config of type "ONE_TO_ONE_NAT". When several candidates exist the last
// one encountered wins, and a missing address is returned as "".
func instanceIPs(inst *compute.Instance) (intIP, extIP string) {
	for _, nic := range inst.NetworkInterfaces {
		if ip := nic.NetworkIP; strings.HasPrefix(ip, "10.") {
			intIP = ip
		}
		for _, ac := range nic.AccessConfigs {
			if ac.Type != "ONE_TO_ONE_NAT" {
				continue
			}
			extIP = ac.NatIP
		}
	}
	return intIP, extIP
}
// State for the Container-Optimized OS image lookup cache used by cosImage.
var (
	// cosListMu is held by cosImage for the duration of each call, which
	// serializes access to cosCache.
	cosListMu sync.Mutex
	// cosCachedTime is not referenced in the visible part of this file;
	// NOTE(review): possibly a leftover from an earlier single-entry
	// cache — confirm before removing.
	cosCachedTime time.Time
	// cosCache maps a COS architecture to its most recent lookup result.
	cosCache = map[dashboard.CosArch]*cosCacheEntry{}
)

// cosCacheEntry is one cached image lookup result.
type cosCacheEntry struct {
	cachedTime  time.Time // when cachedImage was fetched
	cachedImage string    // the image's SelfLink as returned by the images list call
}
// cosImage returns the GCP VM image name (its SelfLink) of the preferred
// Container-Optimized OS image in the "cos-cloud" project whose image
// family is arch: non-deprecated images are preferred over deprecated
// ones, and within each group the most recently created image wins.
//
// Results are cached per architecture for 15 minutes. Calls are
// serialized by cosListMu, including the time spent querying the API.
// An error is returned if the API call fails, if the listing spans more
// than one page, or if no image matches.
func cosImage(ctx context.Context, svc *compute.Service, arch dashboard.CosArch) (string, error) {
	const cacheDuration = 15 * time.Minute
	cosListMu.Lock()
	defer cosListMu.Unlock()

	// Serve from cache while the entry is still fresh.
	if c, ok := cosCache[arch]; ok && c.cachedImage != "" && time.Since(c.cachedTime) < cacheDuration {
		return c.cachedImage, nil
	}

	// cosQuery asks the API for the images of family a and picks the best.
	cosQuery := func(a dashboard.CosArch) (string, error) {
		imList, err := svc.Images.List("cos-cloud").Filter(fmt.Sprintf("(family eq %q)", string(a))).Context(ctx).Do()
		if err != nil {
			return "", err
		}
		if imList.NextPageToken != "" {
			return "", fmt.Errorf("too many images; pagination not supported")
		}
		ims := imList.Items
		if len(ims) == 0 {
			return "", errors.New("no image found")
		}
		sort.Slice(ims, func(i, j int) bool {
			// Deprecation status decides first, in both directions, so
			// that the comparison is a consistent ordering; only images
			// with equal status are ordered by creation time, newest
			// first.
			iDep, jDep := ims[i].Deprecated != nil, ims[j].Deprecated != nil
			if iDep != jDep {
				return !iDep
			}
			return ims[i].CreationTimestamp > ims[j].CreationTimestamp
		})
		return ims[0].SelfLink, nil
	}

	image, err := cosQuery(arch)
	if err != nil {
		return "", err
	}
	cosCache[arch] = &cosCacheEntry{
		cachedTime:  time.Now(),
		cachedImage: image,
	}
	return image, nil
}