From acabf68e066e0fa20721606716de33b3066db2e3 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Mon, 20 Jun 2016 16:33:54 -0700 Subject: [PATCH 1/4] Add README.md for kernel monitor --- pkg/kernelmonitor/README.md | 56 +++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 pkg/kernelmonitor/README.md diff --git a/pkg/kernelmonitor/README.md b/pkg/kernelmonitor/README.md new file mode 100644 index 000000000..329a5d348 --- /dev/null +++ b/pkg/kernelmonitor/README.md @@ -0,0 +1,56 @@ +# Kernel Monitor + +*Kernel Monitor* is a problem daemon in node problem detector. It monitors kernel log +and detects known kernel issues following predefined rules. + +The Kernel Monitor matches kernel issues according to a set of predefined rule list in +[`config/kernel-monitor.json`](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json). +The rule list is extensible. + +## Limitations + +* Kernel Monitor only supports file based kernel log now. It doesn't support log tools +like journald. There is an [open issue](https://github.com/kubernetes/node-problem-detector/issues/14) +to add journald support. + +* Kernel Monitor has assumption on kernel log format, now it only works on Ubuntu and +Debian. However, it is easy to extend it to [support other log format](#support-other-log-format). 
+ +## Add New NodeConditions + +To support new node conditions, you can extend the `conditions` field in +`config/kernel-monitor.json` with new condition definition: + +```json +{ + "type": "NodeConditionType", + "reason": "CamelCaseDefaultNodeConditionReason", + "message": "arbitrary default node condition message" +} +``` + +## Detect New Problems + +To detect new problems, you can extend the `rules` field in `config/kernel-monitor.json` +with new rule definition: + +```json +{ + "type": "temporary/permanent", + "condition": "NodeConditionOfPermanentIssue", + "reason": "CamelCaseShortReason", + "message": "regexp matching the issue in the kernel log" +} +``` + +## Change Log Path + +Kernel log in different OS distros may locate in different path. The `log` +field in `config/kernel-monitor.json` is the log path inside the container. +You can always configure it to match your OS distro. + +## Support Other Log Format + +Kernel monitor uses [`Translator`](https://github.com/kubernetes/node-problem-detector/blob/master/pkg/kernelmonitor/translator/translator.go) +plugin to translate kernel log the internal data structure. It is easy to +implement a new translator for a new log format. From 5a19ac1868db5612f989cfa3bf6fa9e0cb3edcca Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 11 Aug 2016 12:04:05 -0700 Subject: [PATCH 2/4] Get node name from pod, this makes sure that the node name should always be consistent with kubelet. --- Makefile | 2 +- README.md | 12 ++++++++++-- node-problem-detector.yaml | 12 ++++++++++-- pkg/problemclient/problem_client.go | 8 ++++++-- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index e11f2ee96..65557e6d0 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: push # See pod.yaml for the version currently running-- bump this ahead before rebuilding! 
-TAG = v0.1 +TAG = v0.2 PROJ = google_containers diff --git a/README.md b/README.md index c769c0d16..ccafba4ab 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,21 @@ metadata: spec: template: spec: - hostNetwork: true containers: - name: node-problem-detector - image: gcr.io/google_containers/node-problem-detector:v0.1 + image: gcr.io/google_containers/node-problem-detector:v0.2 imagePullPolicy: Always securityContext: privileged: true + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace volumeMounts: - name: log mountPath: /log diff --git a/node-problem-detector.yaml b/node-problem-detector.yaml index 117c0257d..cc762e299 100644 --- a/node-problem-detector.yaml +++ b/node-problem-detector.yaml @@ -8,16 +8,24 @@ spec: labels: app: node-problem-detector spec: - hostNetwork: true containers: - name: node-problem-detector command: - /node-problem-detector - --kernel-monitor=/config/kernel-monitor.json - image: gcr.io/google_containers/node-problem-detector:v0.1 + image: gcr.io/google.com/noogler-kubernetes/node-problem-detector:v0.2 imagePullPolicy: Always securityContext: privileged: true + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace volumeMounts: - name: log mountPath: /log diff --git a/pkg/problemclient/problem_client.go b/pkg/problemclient/problem_client.go index 7c97ba32a..2f5ce537f 100644 --- a/pkg/problemclient/problem_client.go +++ b/pkg/problemclient/problem_client.go @@ -57,11 +57,15 @@ func NewClientOrDie() Client { } // TODO(random-liu): Set QPS Limit c.client = client.NewOrDie(cfg) - // TODO(random-liu): Get node name from cloud provider - c.nodeName, err = os.Hostname() + // Get node name from the current pod. 
+ pod, err := c.client.Pods(os.Getenv("POD_NAMESPACE")).Get(os.Getenv("POD_NAME")) if err != nil { panic(err) } + if pod.Spec.NodeName == "" { + panic("empty node name") + } + c.nodeName = pod.Spec.NodeName c.nodeRef = getNodeRef(c.nodeName) c.recorders = make(map[string]record.EventRecorder) return c From 09af299a886e38ccb771177b2744d4f9524596e9 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 11 Aug 2016 17:03:02 -0700 Subject: [PATCH 3/4] Change the wrong image path in pod yaml. Change the wrong image path of pod yaml. --- node-problem-detector.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-problem-detector.yaml b/node-problem-detector.yaml index cc762e299..e0eaf3e50 100644 --- a/node-problem-detector.yaml +++ b/node-problem-detector.yaml @@ -13,7 +13,7 @@ spec: command: - /node-problem-detector - --kernel-monitor=/config/kernel-monitor.json - image: gcr.io/google.com/noogler-kubernetes/node-problem-detector:v0.2 + image: gcr.io/google_containers/node-problem-detector:v0.2 imagePullPolicy: Always securityContext: privileged: true From 9054dab4c80c0aa7d0fa799f3000b249efd24d7a Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Sat, 20 Aug 2016 19:00:26 -0700 Subject: [PATCH 4/4] Get node name from the downward api. --- Makefile | 2 +- README.md | 8 ++------ node-problem-detector.yaml | 8 ++------ pkg/problemclient/problem_client.go | 24 ++++++++++++++++-------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 65557e6d0..d8ad299d9 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ all: push -# See pod.yaml for the version currently running-- bump this ahead before rebuilding! +# See node-problem-detector.yaml for the version currently running-- bump this ahead before rebuilding! 
TAG = v0.2 PROJ = google_containers diff --git a/README.md b/README.md index ccafba4ab..ff8406837 100644 --- a/README.md +++ b/README.md @@ -73,14 +73,10 @@ spec: securityContext: privileged: true env: - - name: POD_NAME + - name: NODE_NAME valueFrom: fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + fieldPath: spec.nodeName volumeMounts: - name: log mountPath: /log diff --git a/node-problem-detector.yaml b/node-problem-detector.yaml index e0eaf3e50..149fb9ed6 100644 --- a/node-problem-detector.yaml +++ b/node-problem-detector.yaml @@ -18,14 +18,10 @@ spec: securityContext: privileged: true env: - - name: POD_NAME + - name: NODE_NAME valueFrom: fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace + fieldPath: spec.nodeName volumeMounts: - name: log mountPath: /log diff --git a/pkg/problemclient/problem_client.go b/pkg/problemclient/problem_client.go index 2f5ce537f..ebbedbb5a 100644 --- a/pkg/problemclient/problem_client.go +++ b/pkg/problemclient/problem_client.go @@ -57,15 +57,23 @@ func NewClientOrDie() Client { } // TODO(random-liu): Set QPS Limit c.client = client.NewOrDie(cfg) - // Get node name from the current pod. - pod, err := c.client.Pods(os.Getenv("POD_NAMESPACE")).Get(os.Getenv("POD_NAME")) - if err != nil { - panic(err) - } - if pod.Spec.NodeName == "" { - panic("empty node name") + // Get node name from environment variable NODE_NAME + // By default, assume that the NODE_NAME env should have been set with + // downward api. We prefer it because sometimes the hostname returned + // by os.Hostname is not right because: + // 1. User may override the hostname. + // 2. For some cloud providers, os.Hostname is different from the real hostname. + c.nodeName = os.Getenv("NODE_NAME") + if c.nodeName == "" { + // For backward compatibility. If the env is not set, get the hostname + // from os.Hostname(). 
This may not work for all configurations and + // environments. + var err error + c.nodeName, err = os.Hostname() + if err != nil { + panic("empty node name") + } } - c.nodeName = pod.Spec.NodeName c.nodeRef = getNodeRef(c.nodeName) c.recorders = make(map[string]record.EventRecorder) return c