diff --git a/Makefile b/Makefile index e11f2ee96..d8ad299d9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: push -# See pod.yaml for the version currently running-- bump this ahead before rebuilding! -TAG = v0.1 +# See node-problem-detector.yaml for the version currently running-- bump this ahead before rebuilding! +TAG = v0.2 PROJ = google_containers diff --git a/README.md b/README.md index c769c0d16..ff8406837 100644 --- a/README.md +++ b/README.md @@ -66,13 +66,17 @@ metadata: spec: template: spec: - hostNetwork: true containers: - name: node-problem-detector - image: gcr.io/google_containers/node-problem-detector:v0.1 + image: gcr.io/google_containers/node-problem-detector:v0.2 imagePullPolicy: Always securityContext: privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: log mountPath: /log diff --git a/node-problem-detector.yaml b/node-problem-detector.yaml index 117c0257d..149fb9ed6 100644 --- a/node-problem-detector.yaml +++ b/node-problem-detector.yaml @@ -8,16 +8,20 @@ spec: labels: app: node-problem-detector spec: - hostNetwork: true containers: - name: node-problem-detector command: - /node-problem-detector - --kernel-monitor=/config/kernel-monitor.json - image: gcr.io/google_containers/node-problem-detector:v0.1 + image: gcr.io/google_containers/node-problem-detector:v0.2 imagePullPolicy: Always securityContext: privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: log mountPath: /log diff --git a/pkg/kernelmonitor/README.md b/pkg/kernelmonitor/README.md new file mode 100644 index 000000000..329a5d348 --- /dev/null +++ b/pkg/kernelmonitor/README.md @@ -0,0 +1,56 @@ +# Kernel Monitor + +*Kernel Monitor* is a problem daemon in node problem detector. It monitors kernel log +and detects known kernel issues following predefined rules. 
+ +The Kernel Monitor matches kernel issues against a predefined list of rules in +[`config/kernel-monitor.json`](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json). +The rule list is extensible. + +## Limitations + +* Kernel Monitor currently supports only file-based kernel logs. It doesn't support log tools +like journald. There is an [open issue](https://github.com/kubernetes/node-problem-detector/issues/14) +to add journald support. + +* Kernel Monitor makes assumptions about the kernel log format; currently it only works on Ubuntu and +Debian. However, it is easy to extend it to [support other log formats](#support-other-log-format). + +## Add New NodeConditions + +To support new node conditions, you can extend the `conditions` field in +`config/kernel-monitor.json` with a new condition definition: + +```json +{ + "type": "NodeConditionType", + "reason": "CamelCaseDefaultNodeConditionReason", + "message": "arbitrary default node condition message" +} +``` + +## Detect New Problems + +To detect new problems, you can extend the `rules` field in `config/kernel-monitor.json` +with a new rule definition: + +```json +{ + "type": "temporary/permanent", + "condition": "NodeConditionOfPermanentIssue", + "reason": "CamelCaseShortReason", + "message": "regexp matching the issue in the kernel log" +} +``` + +## Change Log Path + +The kernel log may be located at different paths in different OS distros. The `log` +field in `config/kernel-monitor.json` is the log path inside the container. +You can always configure it to match your OS distro. + +## Support Other Log Format + +Kernel Monitor uses a [`Translator`](https://github.com/kubernetes/node-problem-detector/blob/master/pkg/kernelmonitor/translator/translator.go) +plugin to translate the kernel log into the internal data structure. It is easy to +implement a new translator for a new log format. 
diff --git a/pkg/problemclient/problem_client.go b/pkg/problemclient/problem_client.go index 7c97ba32a..ebbedbb5a 100644 --- a/pkg/problemclient/problem_client.go +++ b/pkg/problemclient/problem_client.go @@ -57,10 +57,22 @@ func NewClientOrDie() Client { } // TODO(random-liu): Set QPS Limit c.client = client.NewOrDie(cfg) - // TODO(random-liu): Get node name from cloud provider - c.nodeName, err = os.Hostname() - if err != nil { - panic(err) + // Get node name from environment variable NODE_NAME + // By default, assume that the NODE_NAME env should have been set with + // downward api. We prefer it because sometimes the hostname returned + // by os.Hostname is not right because: + // 1. User may override the hostname. + // 2. For some cloud providers, os.Hostname is different from the real hostname. + c.nodeName = os.Getenv("NODE_NAME") + if c.nodeName == "" { + // For backward compatibility. If the env is not set, get the hostname + // from os.Hostname(). This may not work for all configurations and + // environments. + var err error + c.nodeName, err = os.Hostname() + if err != nil { + panic("empty node name") + } } c.nodeRef = getNodeRef(c.nodeName) c.recorders = make(map[string]record.EventRecorder)