Skip to content

Commit 8080733

Browse files
committed
Improved ThrottlingException handling
* This is experimental code. Two options are explored:
  1. Remove log messages from the buffer (which delays the transfer implicitly). This is safer, but some messages are lost.
  2. Delay the transfer, then retry. Consequences are unknown; there is a risk of compromising system stability.
  Option #2 is used by default. To trigger option #1, set config `purge_buffer_if_throttled: true`.
* Added logging of successful flushes (only to the CloudWatch backend). Useful for troubleshooting; currently commented out.
* Added a heap limit, restricting the Logger process to a hardwired value of 32 MiB (including the message queue).
1 parent baaff8d commit 8080733

3 files changed

Lines changed: 31 additions & 10 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Add `cloud_watch` and `aws` to your list of dependencies in `mix.exs`:
99

1010
```elixir
1111
def deps do
12-
[{:cloud_watch, "~> 0.3.2"},
12+
[{:cloud_watch, "~> 0.3.3"},
1313
{:aws, "~> 0.5.0"}]
1414
end
1515
```

lib/cloud_watch.ex

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ defmodule CloudWatch do
6363
log_stream_name = Keyword.get(opts, :log_stream_name)
6464
max_buffer_size = Keyword.get(opts, :max_buffer_size, @default_max_buffer_size)
6565
max_timeout = Keyword.get(opts, :max_timeout, @default_max_timeout)
66+
purge_buffer_if_throttled? = Keyword.get(opts, :purge_buffer_if_throttled, false) # see "ThrottlingException"
67+
68+
# Limit out of memory problems - slow CloudWatch connection may cause message queue grow out of bounds
69+
max_heap_size = 4194304 # words are 8 bytes, i.e. 32 MiB
70+
:erlang.process_flag(:max_heap_size, max_heap_size) # see http://erlang.org/doc/man/erlang.html
6671

6772
# AWS configuration, only if needed by the AWS library
6873
region = Keyword.get(opts, :region)
@@ -81,6 +86,7 @@ defmodule CloudWatch do
8186
log_stream_name: log_stream_name,
8287
max_buffer_size: max_buffer_size,
8388
max_timeout: max_timeout,
89+
purge_buffer_if_throttled: purge_buffer_if_throttled?,
8490
sequence_token: nil,
8591
flushed_at: nil
8692
}
@@ -94,7 +100,6 @@ defmodule CloudWatch do
94100
message = state.format
95101
|> Logger.Formatter.format(level, msg, ts, md)
96102
|> IO.chardata_to_string
97-
#buffer = List.insert_at(buffer, -1, %InputLogEvent{message: message, timestamp: ts}) # performance impact of adding at the end?
98103
buffer = [%InputLogEvent{message: message, timestamp: ts} | buffer] # buffer order is not relevant, we'll reverse or sort later if needed
99104
%{state | buffer: buffer, buffer_length: buffer_length + 1, buffer_size: buffer_size + byte_size(message) + 26}
100105
end
@@ -115,12 +120,14 @@ defmodule CloudWatch do
115120
do_flush(state, opts, log_group_name, log_stream_name)
116121
end
117122

118-
defp do_flush(%{buffer: buffer} = state, opts, log_group_name, log_stream_name) do
123+
defp do_flush(%{buffer: buffer, buffer_length: buffer_length} = state, opts, log_group_name, log_stream_name) do
119124
events = %{logEvents: Enum.sort_by(buffer, &(&1.timestamp)),
120125
logGroupName: log_group_name, logStreamName: log_stream_name, sequenceToken: state.sequence_token}
121126
case AwsProxy.put_log_events(state.client, events) do
122127
{:ok, %{"nextSequenceToken" => next_sequence_token}, _} ->
123-
{:ok, state |> purge_buffer() |> Map.put(:sequence_token, next_sequence_token)}
128+
{:ok, state |> purge_buffer() |> Map.put(:sequence_token, next_sequence_token)
129+
# |> add_internal_info("CloudWatch Log flushed buffer (#{inspect buffer_length} messages)")
130+
}
124131
{:error, {"DataAlreadyAcceptedException", "The given batch of log events has already been accepted. The next batch can be sent with sequenceToken: " <> next_sequence_token}} ->
125132
state
126133
|> Map.put(:sequence_token, next_sequence_token)
@@ -148,11 +155,21 @@ defmodule CloudWatch do
148155
|> do_flush(opts, log_group_name, log_stream_name)
149156
{:error, {"ThrottlingException", "Rate exceeded"}} ->
150157
# AWS limit is 5 requests per second per log stream. We are supposed to re-try after a delay
151-
# Sleeping here is a quick and dirty hack with possible unwanted consequences
152-
# Better approach: introduce a blackout period. Start removing old logs if buffer size exceeded 1 MB during blackout
153-
state = state |> add_internal_error("CloudWatch Log ThrottlingException: delaying transfer")
154-
Process.sleep(500)
155-
flush(state, opts)
158+
if state.purge_buffer_if_throttled do
159+
# Safe option: delay the transfer by removing all messages from the buffer (some messages will be lost!).
160+
{
161+
:ok,
162+
state
163+
|> purge_buffer()
164+
|> add_internal_error("CloudWatch Log ThrottlingException: #{inspect buffer_length} messages were lost!}")
165+
}
166+
else
167+
# Sleeping here is a quick and dirty hack with possible unwanted consequences
168+
# Better approach: introduce a blackout period. Start removing old logs if buffer size exceeded 1 MB during blackout
169+
state = state |> add_internal_error("CloudWatch Log ThrottlingException: delaying transfer")
170+
Process.sleep(500)
171+
flush(state, opts)
172+
end
156173
{:error, {"ExpiredTokenException", _}} ->
157174
# aws-elixir may require restarting of state.client; ex_aws handles expired tokens internally
158175
flush(state, opts)
@@ -167,6 +184,10 @@ defmodule CloudWatch do
167184
add_internal_message(state, :error, msg)
168185
end
169186

187+
# defp add_internal_info(state, msg) do
188+
# add_internal_message(state, :info, msg)
189+
# end
190+
170191
defp add_internal_message(state, level, msg) do
171192
utc_log? = Application.get_env(:logger, :utc_log, false)
172193
state

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ defmodule CloudWatch.Mixfile do
33

44
def project do
55
[app: :cloud_watch,
6-
version: "0.3.2",
6+
version: "0.3.3",
77
elixir: "~> 1.5",
88
build_embedded: Mix.env == :prod,
99
start_permanent: Mix.env == :prod,

0 commit comments

Comments
 (0)