Hadoop batch ingest errors (Can not combine streams for version 2)

Hi,

I’m ingesting batch data from HDFS into Druid. The ingest works fine when the data size is under 10 GB, but when I increase the ingested data size to 150 GB, the ingest fails with the error below:

Error: io.druid.java.util.common.ISE: Can not combine streams for version 2.
at io.druid.segment.data.GenericIndexedWriter.combineStreams(GenericIndexedWriter.java:316)
at io.druid.segment.StringDimensionMergerV9.writeIndexes(StringDimensionMergerV9.java:291)
at io.druid.segment.IndexMergerV9.makeIndexFiles(IndexMergerV9.java:242)
at io.druid.segment.IndexMergerV9.merge(IndexMergerV9.java:849)
at io.druid.segment.IndexMergerV9.mergeQueryableIndex(IndexMergerV9.java:723)
at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.mergeQueryableIndex(IndexGeneratorJob.java:529)
at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:700)
at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:489)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1724)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:162)

I’m using Druid 0.11. Below is the ingestion spec:

{
  "type" : "index_hadoop",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "foo_datasource",
      "parser" : {
        "type" : "string",
        "parseSpec" : {
          "format" : "json",
          "timestampSpec" : {
            "column" : "timestamp",
            "format" : "auto"
          },
          "dimensionsSpec" : {
            "dimensions" : [],
            "dimensionExclusions" : [],
            "spatialDimensions" : []
          }
        }
      },
      "metricsSpec" : [
        {
          "type" : "count",
          "name" : "count"
        }
      ],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : "NONE",
        "intervals" : [ "2018-02-18/2018-02-21" ]
      }
    },
    "ioConfig" : {
      "type" : "hadoop",
      "inputSpec" : {
        "type" : "static",
        "paths" : "/PATH/TO/FILES"
      }
    },
    "tuningConfig" : {
      "type" : "hadoop",
      "forceExtendableShardSpecs" : true,
      "jobProperties" : {
        "mapreduce.job.user.classpath.first" : "true",
        "mapreduce.reduce.memory.mb" : "16192"
      }
    },
    "hadoopDependencyCoordinates" : [ "org.apache.hadoop:hadoop-client:2.6.0" ]
  }
}

Do I need to tweak any config? Each row is about 16 KB, and a typical job ingests about 200 GB.
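
For example, would adding a partitionsSpec to the tuningConfig to cap the rows per segment help here? This is just a sketch of what I’m considering (the "hashed" type and the targetPartitionSize of 5000000 rows are guesses on my part, not something I’ve tested):

    "tuningConfig" : {
      "type" : "hadoop",
      "forceExtendableShardSpecs" : true,
      "partitionsSpec" : {
        "type" : "hashed",
        "targetPartitionSize" : 5000000
      },
      "jobProperties" : {
        "mapreduce.job.user.classpath.first" : "true",
        "mapreduce.reduce.memory.mb" : "16192"
      }
    }

Or is there a different knob that is more relevant for this error?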

Thanks

Prashanth