Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Commit

Permalink
fix those damn sporadically false positive usage tests
Browse files Browse the repository at this point in the history
after lots of experimentation I figured out the mock clock sometimes
simply doesn't properly trigger, so that in Usage.Report() sometimes
nothing is received on the tick channel, despite advancing the fake
clock by more than strictly necessary (I tried with an extra ms),
despite calling runtime.Gosched() ourselves, and despite sleeping
20 ms with the real clock.

The author of the clock package confirms that due to the way the
runtime schedules goroutines, there's no way around the fake clock
sometimes not working. See
https://gophers.slack.com/archives/general/p1462238960008162

Furthermore, in discussion with the golang developers at
golang/go#8869 it becomes clear that it's
unlikely that we'll have a fakeable clock anytime soon.

Ben Johnson (clock author) suggests in the above mentioned gophers
thread that we could mock out the tick function and pass in a different
function in tests.  However, that changes so much of the time logic
that it becomes pointless to do any time-based testing in this design.

We could also switch to simply testing the basics, without any time-based
checks, since the timing code is pretty easy.

However before we go that route, I wanted to try working with the real
clock.  Basically run the usage reporting in real time, but scaled down
to millisecond level instead of second level, to make it finish fairly
quickly.

So now some semantics are changing a bit:
* we allow up to <period> ms for the usage report to be in the state we
need it
* so we now work with steps, which don't happen at exact predictable
  timestamps, rather they have to happen within a timeframe
* checking timestamp would have gotten more complicated, so I just
removed it.  It's easy to reason that if the updates come within the
allotted times, then the timestamps should also be set correctly.
* there's no serious need to explicitly pass around interval settings
  anymore, we just use 1 everywhere.

If it turns out that this approach also triggers false positives
(for example due to circleCI machines being maxed out of CPU and the
reporting unable to happen within the needed time) then we can address
as needed and still switch to the simpler approach.
But that seems very unlikely.  This should work.
  • Loading branch information
Dieterbe committed Oct 13, 2016
1 parent 5201b69 commit 7f66846
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 89 deletions.
3 changes: 1 addition & 2 deletions metrictank.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (

"github.com/Dieterbe/profiletrigger/heap"
"github.com/Shopify/sarama"
"github.com/benbjohnson/clock"
"github.com/raintank/dur"
"github.com/raintank/met"
"github.com/raintank/met/helper"
Expand Down Expand Up @@ -360,7 +359,7 @@ func main() {

log.Info("metricIndex initialized in %s. starting data consumption", time.Now().Sub(pre))

usg := usage.New(accountingPeriod, metrics, metricIndex, clock.New())
usg := usage.New(time.Second*time.Duration(accountingPeriod), int(accountingPeriod), metrics, metricIndex)

handlers := make([]mdata.ClusterHandler, 0)
if clNSQ.Enabled {
Expand Down
41 changes: 23 additions & 18 deletions usage/usage.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
package usage

import (
"github.com/benbjohnson/clock"
"sync"
"time"

"github.com/raintank/metrictank/idx"
"github.com/raintank/metrictank/mdata"
"gopkg.in/raintank/schema.v1"
"sync"
"time"
)

var Clock clock.Clock
var metrics mdata.Metrics
var metricIndex idx.MetricIndex

Expand All @@ -23,20 +22,27 @@ type orgstat struct {
// tracks for every org
type Usage struct {
sync.Mutex
period uint32
now map[int]orgstat
prev map[int]orgstat
stop chan struct{}
period time.Duration
seconds int
now map[int]orgstat
prev map[int]orgstat
stop chan struct{}
}

func New(period uint32, m mdata.Metrics, i idx.MetricIndex, cl clock.Clock) *Usage {
// New creates a new Usage reporter, reporting every period
// secs controls the second-level interval of the output metrics
// it's up to the caller to set this in accordance to period
// we could deduce this from period using int(period.Seconds()) but I'm
// concerned of float-to-int conversion rounding errors
// for unit testing, the value of secs doesn't really matter
func New(period time.Duration, secs int, m mdata.Metrics, i idx.MetricIndex) *Usage {
metrics = m
metricIndex = i
Clock = cl
ret := &Usage{
period: period,
now: make(map[int]orgstat),
stop: make(chan struct{}),
period: period,
seconds: secs,
now: make(map[int]orgstat),
stop: make(chan struct{}),
}
go ret.Report()
return ret
Expand Down Expand Up @@ -74,22 +80,21 @@ func (u *Usage) set(org int, key string, points uint32) {
}

func (u *Usage) Report() {
period := time.Duration(u.period) * time.Second
// provides "clean" ticks at precise intervals, and delivers them shortly after
tick := func() chan time.Time {
now := Clock.Now()
now := time.Now()
nowUnix := now.UnixNano()
diff := period - (time.Duration(nowUnix) % period)
diff := u.period - (time.Duration(nowUnix) % u.period)
ideal := now.Add(diff)
ch := make(chan time.Time)
go func() {
Clock.Sleep(diff)
time.Sleep(diff)
ch <- ideal
}()
return ch
}
met := schema.MetricData{
Interval: int(u.period),
Interval: u.seconds,
Tags: []string{},
}

Expand Down
186 changes: 117 additions & 69 deletions usage/usage_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package usage

import (
"errors"
"fmt"
"runtime"
"sync"
"testing"
"time"

"github.com/benbjohnson/clock"
"github.com/raintank/met/helper"
"github.com/raintank/metrictank/consolidation"
"github.com/raintank/metrictank/idx/memory"
Expand All @@ -25,20 +27,19 @@ func NewFakeAggMetrics() *FakeAggMetrics {
}
}

// assumes you're holding the lock
func (f *FakeAggMetrics) Get(key string) (mdata.Metric, bool) {
f.Lock()
m, ok := f.Metrics[key]
f.Unlock()
return m, ok
}

// assumes you're holding the lock
func (f *FakeAggMetrics) GetOrCreate(key string) mdata.Metric {
f.Lock()
m, ok := f.Metrics[key]
if !ok {
m = &FakeAggMetric{key, 0, 0}
f.Metrics[key] = m
}
f.Unlock()
return m
}

Expand All @@ -61,142 +62,189 @@ func (f *FakeAggMetric) GetAggregated(consolidator consolidation.Consolidator, a
return 0, make([]iter.Iter, 0)
}

func idFor(org int, metric, unit, mtype string, tags []string, interval uint32) string {
func idFor(org int, metric, unit, mtype string, tags []string) string {
md := schema.MetricData{
OrgId: org,
Metric: metric,
Unit: unit,
Mtype: mtype,
Tags: tags,
Interval: int(interval),
Interval: 1,
}
md.SetId()
return md.Id
}

// wait polls fn until it stops panicking or the attempts run out.
//
// fn is invoked with the aggmetrics lock held and signals "not ready yet" by
// panicking (e.g. via the assert helpers called with a nil *testing.T).
// Each panic is recovered and turned into an error; runtime errors (nil
// dereference, index out of range, ...) are genuine bugs and are re-panicked.
// Between failed attempts wait sleeps one millisecond, and it makes at most
// max attempts. If the last attempt still failed, the test is aborted with
// that error.
func wait(max int, aggmetrics *FakeAggMetrics, t *testing.T, fn func(aggmetrics *FakeAggMetrics)) {
	// attempt runs fn once under the lock and converts a panic into an error.
	attempt := func() (err error) {
		defer func() {
			r := recover()
			if r == nil {
				return
			}
			switch v := r.(type) {
			case runtime.Error:
				// not an assertion failure: propagate unchanged
				panic(r)
			case error:
				err = v
			case string:
				err = errors.New(v)
			default:
				err = fmt.Errorf("%v", r)
			}
		}()
		aggmetrics.Lock()
		defer aggmetrics.Unlock()
		fn(aggmetrics)
		return nil
	}

	var lastErr error
	for n := 0; n < max; n++ {
		if lastErr = attempt(); lastErr == nil {
			return
		}
		time.Sleep(time.Millisecond)
	}
	if lastErr != nil {
		t.Fatalf("waited %d ms, then: %s", max, lastErr)
	}
}

// set t to nil to trigger a panic, useful for inside wait()
func assertLen(epoch int, aggmetrics *FakeAggMetrics, l int, t *testing.T) {
aggmetrics.Lock()
if len(aggmetrics.Metrics) != l {
t.Fatalf("%d seconds in: there should be %d metrics at this point, not %d", epoch, l, len(aggmetrics.Metrics))
e := fmt.Sprintf("%d ms in: there should be %d metrics at this point, not %d", epoch, l, len(aggmetrics.Metrics))
if t != nil {
t.Fatal(e)
} else {
panic(e)
}
}
aggmetrics.Unlock()
}
func assert(interval uint32, epoch int, aggmetrics *FakeAggMetrics, org int, metric, unit, mtype string, ts uint32, val float64, t *testing.T) {
id := idFor(org, metric, unit, mtype, []string{}, interval)

// set t to nil to trigger a panic, useful for inside wait()
func assert(step int, aggmetrics *FakeAggMetrics, org int, metric, unit, mtype string, val float64, t *testing.T) {
id := idFor(org, metric, unit, mtype, []string{})
m, ok := aggmetrics.Get(id)
if !ok {
t.Fatalf("%d seconds in: assert org %d metric %s ts %d val %f -> metric not found", epoch, org, metric, ts, val)
e := fmt.Sprintf("step %d: assert org %d metric %s val %f -> metric not found", step, org, metric, val)
if t != nil {
t.Fatal(e)
} else {
panic(e)
}
}
n := m.(*FakeAggMetric)
if n.lastVal != val || n.lastTs != ts {
t.Fatalf("%d seconds in: assert org %d metric %s ts %d val %f -> got ts %d val %f", epoch, org, metric, ts, val, n.lastTs, n.lastVal)
if n.lastVal != val {
// build the mismatch message: exactly one argument per format verb
// (expected val, then the value actually recorded)
e := fmt.Sprintf("step %d: assert org %d metric %s val %f -> got val %f", step, org, metric, val, n.lastVal)
if t != nil {
t.Fatal(e)
} else {
panic(e)
}
}
}

func TestUsageBasic(t *testing.T) {
mock := clock.NewMock()
aggmetrics := NewFakeAggMetrics()
stats, _ := helper.New(false, "", "standard", "metrictank", "")
mdata.InitMetrics(stats)
metricIndex := memory.New()
metricIndex.Init(stats)
interval := uint32(60)
u := New(interval, aggmetrics, metricIndex, mock)
interval := 60
u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)

assertLen(0, aggmetrics, 0, t)

u.Add(1, "foo")
u.Add(1, "bar")
u.Add(2, "foo")
mock.Add(30 * time.Second)
assertLen(30, aggmetrics, 0, t)
mock.Add(29 * time.Second)
assertLen(59, aggmetrics, 0, t)

u.Add(2, "foo")
mock.Add(time.Second)
assertLen(60, aggmetrics, 4, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 60, 2, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 60, 2, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 2, t)
wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
assertLen(1, aggmetrics, 4, nil)
assert(1, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 2, nil)
assert(1, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 2, nil)
assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 2, nil)
})

u.Add(1, "foo")
u.Add(2, "foo")
mock.Add(60 * time.Second)

assert(interval, 120, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
assert(interval, 120, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 120, 3, t)
assert(interval, 120, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
assert(interval, 120, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 120, 3, t)
u.Add(3, "foo")
wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
assertLen(2, aggmetrics, 6, nil)
assert(2, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(2, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 3, nil)
assert(2, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(2, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 3, nil)
assert(2, aggmetrics, 3, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(2, aggmetrics, 3, "metrictank.usage.numPoints", "point", "counter", 1, nil)
})

u.Stop()
}
func TestUsageMinusOne(t *testing.T) {
mock := clock.NewMock()

func testUsageMinusOne(t *testing.T) {
aggmetrics := NewFakeAggMetrics()
stats, _ := helper.New(false, "", "standard", "metrictank", "")
mdata.InitMetrics(stats)
metricIndex := memory.New()
metricIndex.Init(stats)
interval := uint32(60)
u := New(interval, aggmetrics, metricIndex, mock)
interval := 60
u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)

assertLen(0, aggmetrics, 0, t)

u.Add(-1, "globally-visible") // but usage only reported to org 1
u.Add(1, "foo")
u.Add(2, "bar")
mock.Add(30 * time.Second)
assertLen(30, aggmetrics, 0, t)
mock.Add(29 * time.Second)
assertLen(59, aggmetrics, 0, t)
mock.Add(time.Second)
// not very pretty.. but an easy way to assure that the Usage Reporter
// goroutine has some time to push the results into our fake aggmetrics
time.Sleep(20 * time.Millisecond)
assertLen(60, aggmetrics, 6, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage-minus1.numSeries", "serie", "gauge", 60, 1, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage-minus1.numPoints", "point", "counter", 60, 1, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
assert(interval, 60, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 60, 1, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 1, t)
wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
assertLen(1, aggmetrics, 6, nil)
assert(1, aggmetrics, 1, "metrictank.usage-minus1.numSeries", "serie", "gauge", 1, nil)
assert(1, aggmetrics, 1, "metrictank.usage-minus1.numPoints", "point", "counter", 1, nil)
assert(1, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(1, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 1, nil)
assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 1, nil)
})

u.Stop()
}

func TestUsageWrap32(t *testing.T) {
mock := clock.NewMock()
aggmetrics := NewFakeAggMetrics()
stats, _ := helper.New(false, "", "standard", "metrictank", "")
mdata.InitMetrics(stats)
metricIndex := memory.New()
metricIndex.Init(stats)
interval := uint32(60)
u := New(interval, aggmetrics, metricIndex, mock)
interval := 60
u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)

// max uint32 is 4294967295, let's verify the proper wrapping around that
// pretend an insert maxuint32 -900000

assertLen(0, aggmetrics, 0, t)
u.Add(2, "foo")
u.set(2, "foo", 4294067295)
mock.Add(30 * time.Second)
assertLen(30, aggmetrics, 0, t)
mock.Add(29 * time.Second)
assertLen(59, aggmetrics, 0, t)
mock.Add(time.Second)
assertLen(60, aggmetrics, 2, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 4294067295, t)
wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
assertLen(1, aggmetrics, 2, nil)
// pass nil rather than t: inside wait() a mismatch must panic so the check is
// retried, instead of failing the test on the first not-yet-ready poll
assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 4294067295, nil)
})

for i := 0; i < 1000001; i++ {
u.Add(2, "foo")
}
mock.Add(60 * time.Second)
assertLen(120, aggmetrics, 2, t)
assert(interval, 120, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
assert(interval, 120, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 120, 100000, t)
wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
assertLen(2, aggmetrics, 2, nil)
assert(2, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
assert(2, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 100000, nil)
})

u.Stop()
}

0 comments on commit 7f66846

Please sign in to comment.