fix those damn sporadically false positive usage tests

After lots of experimentation I figured out that the mock clock sometimes simply doesn't trigger properly, so that in Usage.Report() sometimes nothing is received on the tick channel, despite advancing the fake clock by more than strictly necessary (I tried with an extra ms), despite calling runtime.Gosched() ourselves, and despite sleeping 20 ms with the real clock. The author of the clock package confirms that, due to the way the runtime schedules goroutines, there's no way around the fake clock sometimes not working. See https://gophers.slack.com/archives/general/p1462238960008162

Furthermore, in discussion with the golang developers at golang/go#8869 it becomes clear that it's unlikely that we'll have a fakeable clock anytime soon.

Ben Johnson (clock author) suggests in the above-mentioned gophers thread that we could mock out the tick function and pass in a different function in tests. However, that changes so much of the time logic that it becomes pointless to do any time-based testing in this design. We could also switch to simply testing the basics, without anything time-based, since the timing code is pretty easy. However, before we go that route, I wanted to try working with the real clock: basically, run the usage reporting in real time, but scaled down to millisecond level instead of second level, to make it finish fairly quickly.

So now some semantics are changing a bit:
* we allow up to <period> ms for the usage report to reach the state we need
* so we now work with steps, which don't happen at exact predictable timestamps; rather, they have to happen within a timeframe
* checking timestamps would have gotten more complicated, so I just removed it. It's easy to reason that if the updates come within the allotted times, then the timestamps should also be set correctly.
* there's no serious need to explicitly pass around interval settings anymore; we just use 1 everywhere.
If it turns out that this approach also triggers false positives (for example, due to CircleCI machines being maxed out on CPU and the reporting being unable to happen within the needed time), then we can address that as needed and still switch to the simpler approach. But that seems very unlikely. This should work.
grafana · Oct 13, 2016 · 7f66846 · 7f66846
1 parent 5201b69
commit 7f66846
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 89 deletions.
diff --git a/metrictank.go b/metrictank.go
@@ -17,7 +17,6 @@ import (
 
 	"github.com/Dieterbe/profiletrigger/heap"
 	"github.com/Shopify/sarama"
-	"github.com/benbjohnson/clock"
 	"github.com/raintank/dur"
 	"github.com/raintank/met"
 	"github.com/raintank/met/helper"
@@ -360,7 +359,7 @@ func main() {
 
 	log.Info("metricIndex initialized in %s. starting data consumption", time.Now().Sub(pre))
 
-	usg := usage.New(accountingPeriod, metrics, metricIndex, clock.New())
+	usg := usage.New(time.Second*time.Duration(accountingPeriod), int(accountingPeriod), metrics, metricIndex)
 
 	handlers := make([]mdata.ClusterHandler, 0)
 	if clNSQ.Enabled {

diff --git a/usage/usage.go b/usage/usage.go
@@ -3,15 +3,14 @@
 package usage
 
 import (
-	"github.com/benbjohnson/clock"
+	"sync"
+	"time"
+
 	"github.com/raintank/metrictank/idx"
 	"github.com/raintank/metrictank/mdata"
 	"gopkg.in/raintank/schema.v1"
-	"sync"
-	"time"
 )
 
-var Clock clock.Clock
 var metrics mdata.Metrics
 var metricIndex idx.MetricIndex
 
@@ -23,20 +22,27 @@ type orgstat struct {
 // tracks for every org
 type Usage struct {
 	sync.Mutex
-	period uint32
-	now    map[int]orgstat
-	prev   map[int]orgstat
-	stop   chan struct{}
+	period  time.Duration
+	seconds int
+	now     map[int]orgstat
+	prev    map[int]orgstat
+	stop    chan struct{}
 }
 
-func New(period uint32, m mdata.Metrics, i idx.MetricIndex, cl clock.Clock) *Usage {
+// New creates a new Usage reporter, reporting every period.
+// secs controls the second-level interval of the output metrics;
+// it's up to the caller to set this in accordance with period.
+// we could deduce this from period using int(period.Seconds()) but I'm
+// concerned about float-to-int conversion rounding errors.
+// for unit testing, the value of secs doesn't really matter.
+func New(period time.Duration, secs int, m mdata.Metrics, i idx.MetricIndex) *Usage {
 	metrics = m
 	metricIndex = i
-	Clock = cl
 	ret := &Usage{
-		period: period,
-		now:    make(map[int]orgstat),
-		stop:   make(chan struct{}),
+		period:  period,
+		seconds: secs,
+		now:     make(map[int]orgstat),
+		stop:    make(chan struct{}),
 	}
 	go ret.Report()
 	return ret
@@ -74,22 +80,21 @@ func (u *Usage) set(org int, key string, points uint32) {
 }
 
 func (u *Usage) Report() {
-	period := time.Duration(u.period) * time.Second
 	// provides "clean" ticks at precise intervals, and delivers them shortly after
 	tick := func() chan time.Time {
-		now := Clock.Now()
+		now := time.Now()
 		nowUnix := now.UnixNano()
-		diff := period - (time.Duration(nowUnix) % period)
+		diff := u.period - (time.Duration(nowUnix) % u.period)
 		ideal := now.Add(diff)
 		ch := make(chan time.Time)
 		go func() {
-			Clock.Sleep(diff)
+			time.Sleep(diff)
 			ch <- ideal
 		}()
 		return ch
 	}
 	met := schema.MetricData{
-		Interval: int(u.period),
+		Interval: u.seconds,
 		Tags:     []string{},
 	}
 

diff --git a/usage/usage_test.go b/usage/usage_test.go
@@ -1,11 +1,13 @@
 package usage
 
 import (
+	"errors"
+	"fmt"
+	"runtime"
 	"sync"
 	"testing"
 	"time"
 
-	"github.com/benbjohnson/clock"
 	"github.com/raintank/met/helper"
 	"github.com/raintank/metrictank/consolidation"
 	"github.com/raintank/metrictank/idx/memory"
@@ -25,20 +27,19 @@ func NewFakeAggMetrics() *FakeAggMetrics {
 	}
 }
 
+// assumes you're holding the lock
 func (f *FakeAggMetrics) Get(key string) (mdata.Metric, bool) {
-	f.Lock()
 	m, ok := f.Metrics[key]
-	f.Unlock()
 	return m, ok
 }
+
+// assumes you're holding the lock
 func (f *FakeAggMetrics) GetOrCreate(key string) mdata.Metric {
-	f.Lock()
 	m, ok := f.Metrics[key]
 	if !ok {
 		m = &FakeAggMetric{key, 0, 0}
 		f.Metrics[key] = m
 	}
-	f.Unlock()
 	return m
 }
 
@@ -61,142 +62,189 @@ func (f *FakeAggMetric) GetAggregated(consolidator consolidation.Consolidator, a
 	return 0, make([]iter.Iter, 0)
 }
 
-func idFor(org int, metric, unit, mtype string, tags []string, interval uint32) string {
+func idFor(org int, metric, unit, mtype string, tags []string) string {
 	md := schema.MetricData{
 		OrgId:    org,
 		Metric:   metric,
 		Unit:     unit,
 		Mtype:    mtype,
 		Tags:     tags,
-		Interval: int(interval),
+		Interval: 1,
 	}
 	md.SetId()
 	return md.Id
 }
 
+// wait executes fn, which can do various assertions and panics when things aren't right.
+// it recovers the panic and retries roughly once per millisecond, for up to max attempts.
+// if the error still happens on the last attempt, it reports it as a test failure.
+func wait(max int, aggmetrics *FakeAggMetrics, t *testing.T, fn func(aggmetrics *FakeAggMetrics)) {
+	execute := func() (err error) {
+		defer func(errp *error) {
+			e := recover()
+			if e != nil {
+				if _, ok := e.(runtime.Error); ok {
+					panic(e)
+				}
+				if err, ok := e.(error); ok {
+					*errp = err
+				} else if errStr, ok := e.(string); ok {
+					*errp = errors.New(errStr)
+				} else {
+					*errp = fmt.Errorf("%v", e)
+				}
+			}
+			return
+		}(&err)
+		aggmetrics.Lock()
+		defer aggmetrics.Unlock()
+		fn(aggmetrics)
+		return err
+	}
+	var err error
+	for i := 1; i <= max; i++ {
+		err = execute()
+		if err == nil {
+			break
+		}
+		//fmt.Printf("sleeping %d/%d\n", i, max)
+		time.Sleep(time.Millisecond)
+	}
+	if err != nil {
+		t.Fatalf("waited %d ms, then: %s", max, err)
+	}
+}
+
+// set t to nil to trigger a panic, useful for inside wait()
 func assertLen(epoch int, aggmetrics *FakeAggMetrics, l int, t *testing.T) {
-	aggmetrics.Lock()
 	if len(aggmetrics.Metrics) != l {
-		t.Fatalf("%d seconds in: there should be %d metrics at this point, not %d", epoch, l, len(aggmetrics.Metrics))
+		e := fmt.Sprintf("%d ms in: there should be %d metrics at this point, not %d", epoch, l, len(aggmetrics.Metrics))
+		if t != nil {
+			t.Fatal(e)
+		} else {
+			panic(e)
+		}
 	}
-	aggmetrics.Unlock()
 }
-func assert(interval uint32, epoch int, aggmetrics *FakeAggMetrics, org int, metric, unit, mtype string, ts uint32, val float64, t *testing.T) {
-	id := idFor(org, metric, unit, mtype, []string{}, interval)
+
+// set t to nil to trigger a panic, useful for inside wait()
+func assert(step int, aggmetrics *FakeAggMetrics, org int, metric, unit, mtype string, val float64, t *testing.T) {
+	id := idFor(org, metric, unit, mtype, []string{})
 	m, ok := aggmetrics.Get(id)
 	if !ok {
-		t.Fatalf("%d seconds in: assert org %d metric %s ts %d val %f -> metric not found", epoch, org, metric, ts, val)
+		e := fmt.Sprintf("step %d: assert org %d metric %s val %f -> metric not found", step, org, metric, val)
+		if t != nil {
+			t.Fatal(e)
+		} else {
+			panic(e)
+		}
 	}
 	n := m.(*FakeAggMetric)
-	if n.lastVal != val || n.lastTs != ts {
-		t.Fatalf("%d seconds in: assert org %d metric %s ts %d val %f -> got ts %d val %f", epoch, org, metric, ts, val, n.lastTs, n.lastVal)
+	if n.lastVal != val {
+		e := fmt.Sprintf("step %d: assert org %d metric %s val %f -> got val %f", step, org, metric, val, n.lastTs, n.lastVal)
+		if t != nil {
+			t.Fatal(e)
+		} else {
+			panic(e)
+		}
 	}
 }
 
 func TestUsageBasic(t *testing.T) {
-	mock := clock.NewMock()
 	aggmetrics := NewFakeAggMetrics()
 	stats, _ := helper.New(false, "", "standard", "metrictank", "")
 	mdata.InitMetrics(stats)
 	metricIndex := memory.New()
 	metricIndex.Init(stats)
-	interval := uint32(60)
-	u := New(interval, aggmetrics, metricIndex, mock)
+	interval := 60
+	u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)
 
 	assertLen(0, aggmetrics, 0, t)
 
 	u.Add(1, "foo")
 	u.Add(1, "bar")
 	u.Add(2, "foo")
-	mock.Add(30 * time.Second)
-	assertLen(30, aggmetrics, 0, t)
-	mock.Add(29 * time.Second)
-	assertLen(59, aggmetrics, 0, t)
-
 	u.Add(2, "foo")
-	mock.Add(time.Second)
-	assertLen(60, aggmetrics, 4, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 60, 2, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 60, 2, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 2, t)
+	wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
+		assertLen(1, aggmetrics, 4, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 2, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 2, nil)
+		assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 2, nil)
+	})
 
 	u.Add(1, "foo")
 	u.Add(2, "foo")
-	mock.Add(60 * time.Second)
-
-	assert(interval, 120, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
-	assert(interval, 120, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 120, 3, t)
-	assert(interval, 120, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
-	assert(interval, 120, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 120, 3, t)
+	u.Add(3, "foo")
+	wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
+		assertLen(2, aggmetrics, 6, nil)
+		assert(2, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(2, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 3, nil)
+		assert(2, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(2, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 3, nil)
+		assert(2, aggmetrics, 3, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(2, aggmetrics, 3, "metrictank.usage.numPoints", "point", "counter", 1, nil)
+	})
 
 	u.Stop()
 }
-func TestUsageMinusOne(t *testing.T) {
-	mock := clock.NewMock()
+
+func testUsageMinusOne(t *testing.T) {
 	aggmetrics := NewFakeAggMetrics()
 	stats, _ := helper.New(false, "", "standard", "metrictank", "")
 	mdata.InitMetrics(stats)
 	metricIndex := memory.New()
 	metricIndex.Init(stats)
-	interval := uint32(60)
-	u := New(interval, aggmetrics, metricIndex, mock)
+	interval := 60
+	u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)
 
 	assertLen(0, aggmetrics, 0, t)
 
 	u.Add(-1, "globally-visible") // but usage only reported to org 1
 	u.Add(1, "foo")
 	u.Add(2, "bar")
-	mock.Add(30 * time.Second)
-	assertLen(30, aggmetrics, 0, t)
-	mock.Add(29 * time.Second)
-	assertLen(59, aggmetrics, 0, t)
-	mock.Add(time.Second)
-	// not very pretty.. but an easy way to assure that the Usage Reporter
-	// goroutine has some time to push the results into our fake aggmetrics
-	time.Sleep(20 * time.Millisecond)
-	assertLen(60, aggmetrics, 6, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage-minus1.numSeries", "serie", "gauge", 60, 1, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage-minus1.numPoints", "point", "counter", 60, 1, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
-	assert(interval, 60, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 60, 1, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 1, t)
+	wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
+		assertLen(1, aggmetrics, 6, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage-minus1.numSeries", "serie", "gauge", 1, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage-minus1.numPoints", "point", "counter", 1, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(1, aggmetrics, 1, "metrictank.usage.numPoints", "point", "counter", 1, nil)
+		assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 1, nil)
+	})
 
 	u.Stop()
 }
+
 func TestUsageWrap32(t *testing.T) {
-	mock := clock.NewMock()
 	aggmetrics := NewFakeAggMetrics()
 	stats, _ := helper.New(false, "", "standard", "metrictank", "")
 	mdata.InitMetrics(stats)
 	metricIndex := memory.New()
 	metricIndex.Init(stats)
-	interval := uint32(60)
-	u := New(interval, aggmetrics, metricIndex, mock)
+	interval := 60
+	u := New(time.Duration(interval)*time.Millisecond, 1, aggmetrics, metricIndex)
 
 	// max uint32 is 4294967295, let's verify the proper wrapping around that
 	// pretend an insert maxuint32 -900000
 
 	assertLen(0, aggmetrics, 0, t)
 	u.Add(2, "foo")
 	u.set(2, "foo", 4294067295)
-	mock.Add(30 * time.Second)
-	assertLen(30, aggmetrics, 0, t)
-	mock.Add(29 * time.Second)
-	assertLen(59, aggmetrics, 0, t)
-	mock.Add(time.Second)
-	assertLen(60, aggmetrics, 2, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 60, 1, t)
-	assert(interval, 60, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 60, 4294067295, t)
+	wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
+		assertLen(1, aggmetrics, 2, nil)
+		assert(1, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, t)
+		assert(1, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 4294067295, t)
+	})
 
 	for i := 0; i < 1000001; i++ {
 		u.Add(2, "foo")
 	}
-	mock.Add(60 * time.Second)
-	assertLen(120, aggmetrics, 2, t)
-	assert(interval, 120, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 120, 1, t)
-	assert(interval, 120, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 120, 100000, t)
+	wait(60, aggmetrics, t, func(aggmetrics *FakeAggMetrics) {
+		assertLen(2, aggmetrics, 2, nil)
+		assert(2, aggmetrics, 2, "metrictank.usage.numSeries", "serie", "gauge", 1, nil)
+		assert(2, aggmetrics, 2, "metrictank.usage.numPoints", "point", "counter", 100000, nil)
+	})
 
 	u.Stop()
 }