forked from elastic/built-docs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathperformance.html
223 lines (204 loc) · 18.6 KB
/
performance.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
<!DOCTYPE html>
<html lang="en-us">
<head>
<meta charset="UTF-8">
<meta name="description" content="Reference documentation of elasticsearch-hadoop">
<title>Performance considerations | Elasticsearch for Apache Hadoop [8.9] | Elastic</title>
<meta class="elastic" name="content" content="Performance considerations | Elasticsearch for Apache Hadoop [8.9]">
<link rel="home" href="index.html" title="Elasticsearch for Apache Hadoop [8.9]"/>
<link rel="up" href="reference.html" title="Elasticsearch for Apache Hadoop"/>
<link rel="prev" href="metrics.html" title="Hadoop Metrics"/>
<link rel="next" href="cloud.html" title="Cloud/restricted environments"/>
<meta class="elastic" name="product_version" content="8.9"/>
<meta class="elastic" name="product_name" content="Elasticsearch"/>
<meta class="elastic" name="website_area" content="documentation"/>
<meta name="DC.type" content="Learn/Docs/Elasticsearch/Apache Hadoop/8.9"/>
<meta name="DC.subject" content="Elasticsearch"/>
<meta name="DC.identifier" content="8.9"/>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<script src="https://cdn.optimizely.com/js/18132920325.js"></script>
<link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
<link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
<link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
<link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
<link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
<link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
<link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
<link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
<link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
<link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
<link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
<link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
<link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
<link rel="manifest" href="/manifest.json">
<meta name="apple-mobile-web-app-title" content="Elastic">
<meta name="application-name" content="Elastic">
<meta name="msapplication-TileColor" content="#ffffff">
<meta name="msapplication-TileImage" content="/mstile-144x144.png">
<meta name="theme-color" content="#ffffff">
<meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd" />
<meta name="yandex-verification" content="d8a47e95d0972434" />
<meta name="localized" content="true" />
<meta name="st:robots" content="follow,index" />
<meta property="og:image" content="https://static-www.elastic.co/v3/assets/bltefdd0b53724fa2ce/blt280217a63b82a734/6202d3378b1f312528798412/elastic-logo.svg" />
<meta property="og:image:width" content="500" />
<meta property="og:image:height" content="172" />
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
<link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
<link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
<!-- Give IE8 a fighting chance -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<link rel="stylesheet" type="text/css" href="/guide/static/styles.css" />
</head>
<!--© 2015-2022 Elasticsearch B.V. -->
<!-- All Elastic documentation is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License. -->
<!-- http://creativecommons.org/licenses/by-nc-nd/4.0/ -->
<body>
<!-- Google Tag Manager -->
<script>dataLayer = [];</script><noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-58RLH5" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
<!-- End Google Tag Manager -->
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'UA-12395217-16');
</script>
<!-- Google Tag Manager for GA4 -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-KNJMG2M');</script>
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-KNJMG2M" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager for GA4-->
<div id='elastic-nav' style="display:none;"></div>
<script src='https://www.elastic.co/elastic-nav.js'></script>
<div class="main-container">
<section id="content" >
<div class="content-wrapper">
<section id="guide" lang="en">
<div class="container-fluid">
<div class="row pb-3">
<div class="col-12 order-2 col-md-4 order-md-1 col-lg-3 h-almost-full-md sticky-top-md" id="left_col">
<!-- The TOC is appended here -->
</div>
<div class="col-12 order-1 col-md-8 order-md-2 col-lg-7 order-lg-2 guide-section" id="middle_col">
<!-- start body -->
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="/guide/">Elastic Docs</a></span>
<span class="chevron-right">›</span><span class="breadcrumb-link"><a href="index.html">Elasticsearch for Apache Hadoop [8.9]</a></span>
<span class="chevron-right">›</span><span class="breadcrumb-link"><a href="reference.html">Elasticsearch for Apache Hadoop</a></span>
</div>
<div class="navheader">
<span class="prev">
<a href="metrics.html">« Hadoop Metrics</a>
</span>
<span class="next">
<a href="cloud.html">Cloud/restricted environments »</a>
</span>
</div>
<div class="chapter">
<div class="titlepage"><div><div>
<h2 class="title"><a id="performance"></a>Performance considerations<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h2>
</div></div></div>
<p>As the connector itself does not abstract Elasticsearch nor is it stateful per se (does not keep any state between jobs and only sends writes in bulk - see below), when looking at performance, one should keep an eye on Elasticsearch itself and its behavior.</p>
<p>Do note that in general, we advise against changing too many settings especially on the Elasticsearch side simply because the defaults are good enough for most cases. With every release Elasticsearch becomes <em>smarter</em> about its environment and automatically adjusts itself (for example HDD vs SDD).</p>
<p>Unless you really <em>know</em> what you are doing and have actual results showing the improvements (aka real benchmarks) do <span class="strong strong"><strong>not</strong></span> make blind adjustments simply because there is a blog or post somewhere telling you to do so. Way too many times we have encountered systems that underperformed due to some misconfiguration that was supposed to do otherwise.</p>
<h3><a id="_always_aim_for_stability"></a>Always aim for stability<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h3>
<p>Do note that the first and most important part of a system (even more so in a distributed one) is to have a stable, predictable environment as much as possible. While failure is a certainty (it is not a question of if but rather when), one should also keep in mind the stability of a system under pressure.
Be generous with your hardware and start with baby steps - if a system cannot perform adequate under small load, it will certainly lag behind under pressure.
Always stability should come first and performance second. One can’t have a performant system without it being stable.</p>
<p>Please keep this in mind and do not tweak settings hoping it would automatically increase performance.
That is why for reliable results, always measure to see whether the changes that you make, have the desired impact.</p>
<h3><a id="_read_performance"></a>Read performance<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h3>
<p>elasticsearch-hadoop performs a parallel read spread across all the target resource shards. It relies on <a href="/guide/en/elasticsearch/guide/current/scroll.html" class="ulink" target="_top">scrolling</a> which is as efficient as it can be in returning the results of a query.
If the read performance is not optimal, first try to see how <em>expensive</em> the query is by interogating Elasticsearch directly. The Elasticsearch documentation is a good place to start, such as the <a href="/guide/en/elasticsearch/guide/current/_improving_performance.html" class="ulink" target="_top">Improving Performance</a> page; similar queries might return the same results but can have a different cost.</p>
<p>If a query is inefficient, try rewriting it in a more optimal way. Sometimes changing the data structure (for example by adding a new field or denormalizing it) can significantly improve the query performance as there are less calculations that need to be done.</p>
<p>Note that typically, the number of results do not influence the performance of the connector nor Elasticsearch itself. The connector does <span class="strong strong"><strong>not</strong></span> read all the results at once but rather in chunks and passes them onwards to the consumer.
Thus typically the size becomes an issue on the consumer side depending on whether it tries to buffer the results from the connector or can simply iterate or stream through them.</p>
<p>A common concern is whether increasing the number of shards improves read performance. And it does if that means scaling out, that is spread the data better across multiple nodes (which ideally sit on multiple machines). If that is not the case, the shards end up sitting on the same node and thus on the same machine; there’s simply <em>virtual</em> partitioning of data as the hardware behind it remains the same.</p>
<p>This can increase the consumption parallelism on the client side (more shards means more tasks that can read the data from Elasticsearch at once) however from Elasticsearch perspective there are no actual gains and thus query performance will likely remain the same.</p>
<h3><a id="_write_performance"></a>Write performance<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h3>
<p>A crucial aspect in improving the write performance is to determine the maximum rate of data that Elasticsearch can ingest comfortably. This depends on many variables (data size, hardware, current load, etc..) but a good rule of thumb is for a bulk request to not take longer than 1-2s to be successfully processed.</p>
<p>Since elasticsearch-hadoop performs parallel writes, it is important to keep this in mind across <em>all</em> tasks, which are created by Hadoop/Spark at runtime.</p>
<h4><a id="_decrease_bulk_size"></a>Decrease bulk size<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h4>
<p>Remember that elasticsearch-hadoop allows one to configure the number of entries and size for a batch write to Elasticsearch <em>per task</em>.
That is, assuming there are <code class="literal">T</code> tasks, with a configuration of <code class="literal">B</code> bytes and <code class="literal">N</code> number of documents (where <code class="literal">d</code> is the average document size), the maximum number of bulk write requests at a given point in time can be
<code class="literal">T*B</code> bytes or <code class="literal">T*N</code> number of docs (<code class="literal">T*N*d</code> in bytes) - which ever comes first.</p>
<p>Thus for a job with 5 tasks, using the defaults (1mb or 1000 docs) means up to 5mb/5000 docs bulk size (spread across shards). If this takes more than 1-2s to be processed, there’s no need to decrease it. If it’s less then that, you can try increasing it in <em>small</em> steps.</p>
<h4><a id="_use_a_maximum_limit_of_tasks_writing_to_elasticsearch"></a>Use a maximum limit of tasks writing to Elasticsearch<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h4>
<p>In case of elasticsearch-hadoop, the runtime (Hadoop or Spark) can and will create multiple tasks that will write at the same time to Elasticsearch. In some cases, this leads to a disproportionate number of tasks (sometimes one or even two orders of magnitude higher) between what the user planned to use for Elasticsearch and the actual number.
This appears in particular when dealing with inputs that are highly splittable which can easily generate tens or hundreds of tasks. If the target Elasticsearch cluster has just a few nodes, likely dealing with so much data at once will be problematic.</p>
<h4><a id="_understand_why_rejections_happen"></a>Understand why rejections happen<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h4>
<p>Under too much load, Elasticsearch starts rejecting documents - in such a case elasticsearch-hadoop waits for a while (default 10s) and then retries (by default up to 3 times). If Elasticsearch keeps rejecting documents, the job will eventually fail.
In such a scenario, monitor Elasticsearch (through Marvel or other plugins) and keep an eye on bulk processing. Look at the percentage of documents being rejected; it is perfectly fine to have some documents rejected but anything higher then 10-15% on a regular basis is a good indication the cluster is overloaded.</p>
<h4><a id="_keep_the_number_of_retries_and_waits_at_bay"></a>Keep the number of retries and waits at bay<a class="edit_me" rel="nofollow" title="Edit this page on GitHub" href="https://github.com/elastic/elasticsearch-hadoop/edit/8.9/docs/src/reference/asciidoc/core/performance.adoc">edit</a></h4>
<p>As mentioned above, retries happen when documents are being rejected. This can be for a number of reasons - a sudden indexing spike, node relocation, etc…​ If your job keeps being aborted as the retries fail, this is a good indication your cluster is overloaded.
Increasing it will <span class="strong strong"><strong>not</strong></span> make the problem go away, rather it will just hide it under the rug; the job will be quite slow (as likely each write will be retried several times until all documents are acknowledged).</p>
</div>
<div class="navfooter">
<span class="prev">
<a href="metrics.html">« Hadoop Metrics</a>
</span>
<span class="next">
<a href="cloud.html">Cloud/restricted environments »</a>
</span>
</div>
</div>
<!-- end body -->
</div>
<div class="col-12 order-3 col-lg-2 order-lg-3 h-almost-full-lg sticky-top-lg" id="right_col">
<div id="sticky_content">
<!-- The OTP is appended here -->
<div class="row">
<div class="col-0 col-md-4 col-lg-0" id="bottom_left_col"></div>
<div class="col-12 col-md-8 col-lg-12">
<div id="rtpcontainer">
<div class="mktg-promo" id="most-popular">
<p class="aside-heading">Most Popular</p>
<div class="pb-2">
<p class="media-type">Video</p>
<a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&elektra=docs&storm=top-video">
<p class="mb-0">Get Started with Elasticsearch</p>
</a>
</div>
<div class="pb-2">
<p class="media-type">Video</p>
<a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&elektra=docs&storm=top-video">
<p class="mb-0">Intro to Kibana</p>
</a>
</div>
<div class="pb-2">
<p class="media-type">Video</p>
<a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&elektra=docs&storm=top-video">
<p class="mb-0">ELK for Logs & Metrics</p>
</a>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
</div>
<div id='elastic-footer'></div>
<script src='https://www.elastic.co/elastic-footer.js'></script>
<!-- Footer Section end-->
</section>
</div>
<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
window.initial_state = {}</script>
</body>
</html>