Running into IndexServicePermanentException on production druid cluster

We have had a Druid cluster running in production for a couple of years now. Very rarely, we run into this issue, which causes us to lose data for an entire indexing granularity period (1 hour). Has anyone encountered this? If so, is there a fix for it?

We have 5 instances of ZooKeeper

2 overlords

250 middle managers (running 2 peons each)

2 coordinators

60 historical nodes

Caused by: com.metamx.tranquility.druid.IndexServicePermanentException: Service[druid:overlord] call failed with status: 400 Bad Request (/druid/indexer/v1/task,{"error":"Task[index_realtime_druidtable_2018-11-21T01:00:00.000Z_0_0] already exists!"})

at com.metamx.tranquility.druid.IndexService$$anonfun$call$1$$anonfun$apply$17.apply(IndexService.scala:158)

at com.metamx.tranquility.druid.IndexService$$anonfun$call$1$$anonfun$apply$17.apply(IndexService.scala:132)

at com.twitter.util.Future$$anonfun$map$1$$anonfun$apply$6.apply(Future.scala:950)

at com.twitter.util.Try$.apply(Try.scala:13)

at com.twitter.util.Future$.apply(Future.scala:97)

at com.twitter.util.Future$$anonfun$map$1.apply(Future.scala:950)

at com.twitter.util.Future$$anonfun$map$1.apply(Future.scala:949)

at com.twitter.util.Promise$Transformer.liftedTree1$1(Promise.scala:112)

at com.twitter.util.Promise$Transformer.k(Promise.scala:112)

at com.twitter.util.Promise$Transformer.apply(Promise.scala:122)

at com.twitter.util.Promise$Transformer.apply(Promise.scala:103)

at com.twitter.util.Promise$$anon$1.run(Promise.scala:366)

at com.twitter.concurrent.LocalScheduler$Activation.run(Scheduler.scala:178)

at com.twitter.concurrent.LocalScheduler$Activation.submit(Scheduler.scala:136)

at com.twitter.concurrent.LocalScheduler.submit(Scheduler.scala:207)

at com.twitter.concurrent.Scheduler$.submit(Scheduler.scala:92)

at com.twitter.util.Promise.runq(Promise.scala:350)

at com.twitter.util.Promise.updateIfEmpty(Promise.scala:721)

at com.twitter.util.Promise.update(Promise.scala:694)

at com.twitter.util.Promise.setValue(Promise.scala:670)

at com.twitter.concurrent.AsyncQueue.offer(AsyncQueue.scala:111)

at com.twitter.finagle.netty3.transport.ChannelTransport.handleUpstream(ChannelTransport.scala:55)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)

at org.jboss.netty.handler.codec.http.HttpContentDecoder.messageReceived(HttpContentDecoder.java:108)

at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)

at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)

at org.jboss.netty.handler.codec.http.HttpChunkAggregator.messageReceived(HttpChunkAggregator.java:194)

at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)

at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)

at org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:459)

at org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:536)

at org.jboss.netty.handler.codec.replay.ReplayingDecoder.messageReceived(ReplayingDecoder.java:435)

at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)

at org.jboss.netty.handler.codec.http.HttpClientCodec.handleUpstream(HttpClientCodec.java:92)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)

at org.jboss.netty.channel.SimpleChannelHandler.messageReceived(SimpleChannelHandler.java:142)

at com.twitter.finagle.netty3.channel.ChannelStatsHandler.messageReceived(ChannelStatsHandler.scala:78)

at org.jboss.netty.channel.SimpleChannelHandler.handleUpstream(SimpleChannelHandler.java:88)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)

at org.jboss.netty.channel.SimpleChannelHandler.messageReceived(SimpleChannelHandler.java:142)

at com.twitter.finagle.netty3.channel.ChannelRequestStatsHandler.messageReceived(ChannelRequestStatsHandler.scala:35)

at org.jboss.netty.channel.SimpleChannelHandler.handleUpstream(SimpleChannelHandler.java:88)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)

at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)

at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)

at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)

at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)

at org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)

at org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337)

at org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)

at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)

at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)

at org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)

... 3 more

We are using Druid v0.9.0.