Druid Hadoop indexing OutOfMemory exception

Hi Team,

We are kind of stuck on an indexing task while indexing around 1 GB of events into Druid from Hadoop.

The problem is that the reduce tasks hang at 78% and then fail with an OutOfMemory exception. We are using 5 HyperUnique aggregators (see the metricsSpec below).

Should we really use the HyperUnique aggregator rather than the count metric, provided we use an appropriate groupBy query?
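
For reference, this is roughly how we intend to query those metrics at query time. This is just a sketch, reusing the dist_visitors and count metric names defined in the index task spec below:

{
  "queryType":"timeseries",
  "dataSource":"elmoreporting_server1",
  "granularity":"hour",
  "intervals":["2016-10-24/2016-10-28"],
  "aggregations":[
    { "type":"hyperUnique", "name":"dist_visitors", "fieldName":"dist_visitors" },
    { "type":"longSum", "name":"count", "fieldName":"count" }
  ]
}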

Index task json:

{
  "type":"index_hadoop",
  "spec":{
    "dataSchema":{
      "dataSource":"elmoreporting_server1",
      "parser":{
        "type":"hadoopyString",
        "parseSpec":{
          "format":"json",
          "timestampSpec":{
            "column":"timestamp",
            "format":"auto"
          },
          "dimensionsSpec":{
            "dimensions":[
              "geo_location_id",
              "buyer_geo_location",
              "geo_loc_lat_lng",
              "locale",
              "component_name",
              "page_name",
              "page_group",
              "os",
              "browser_version",
              "browser_type",
              "account_type",
              "device_id",
              **"timestamp"**
            ],
            "dimensionExclusions":[

            ],
            "spatialDimensions":[

            ]
          }
        }
      },
      "metricsSpec":[
        {
          "type":"count",
          "name":"count"
        },
        {
          "type":"hyperUnique",
          "name":"dist_visits",
          "fieldName":"visitor_id"
        },
        {
          "type":"hyperUnique",
          "name":"dist_visitors",
          "fieldName":"cookie_id"
        },
        {
          "type":"hyperUnique",
          "name":"dist_users",
          "fieldName":"d_acc_no"
        },
        {
          "type":"hyperUnique",
          "name":"dist_merchants",
          "fieldName":"d_merchant_no"
        },
        {
          "type":"hyperUnique",
          "name":"dist_transactions",
          "fieldName":"d_trans_id"
        }
      ],
      "granularitySpec":{
        "type":"uniform",
        "segmentGranularity":"HOUR",
        "queryGranularity":"HOUR",
        "intervals":[
          "2016-10-24/2016-10-28"
        ]
      }
    },
    "ioConfig":{
      "type":"hadoop",
      "partitionsSpec":{
            "type":"hashed",
            "targetPartitionSize":5000000
         },
      "inputSpec":{
        "type":"static",
        "paths":"/apps/dt/pxp/streaming/events/server/2016/10/26/00/part-*"
      }
    },
    "tuningConfig":{
      "ignoreInvalidRows":false,
      "type":"hadoop",
      "maxRowsInMemory": 250000000,
      "jobProperties":{
        "hdp.version":"2.2.9.0-3393",
        "mapreduce.job.user.classpath.first":"true",
        "mapreduce.task.timeout":1800000,
        "mapreduce.map.memory.mb":4096,
        "mapreduce.map.java.opts":"-server -Xmx3072m -Duser.timezone=UTC -Dfile.encoding=UTF-8",
        "mapreduce.reduce.memory.mb":10240,
        "mapreduce.reduce.java.opts":"-server -Xmx8192m -Duser.timezone=UTC -Dfile.encoding=UTF-8",
        "mapreduce.job.queuename":"dt_product",
        "mapred.child.ulimit":16777216,
        "mapreduce.map.output.compress":true,
        "mapred.map.output.compress.codec":"org.apache.hadoop.io.compress.SnappyCodec"
      },
      "partitionsSpec":{
        "type":"hashed",
        "targetPartitionSize":-1,
        "rowFlushBoundary":0,
        "numShards":100
      }
    }
  },
  "hadoopDependencyCoordinates":[
    "org.apache.hadoop:hadoop-client:2.6.0.2.2.9.9-2"
  ]
}

Thanks,

Sathish

Sorry, the data size is around 100 GB, not 1 GB.

Hey, I see you have two conflicting sharding specs.

The first will not take effect at all, since the place to specify it is the tuningConfig:

 "partitionsSpec":{
            "type":"hashed",
            "targetPartitionSize":5000000
         },

The second is telling Druid to create 100 shards per interval. Unless that's what you want, I would recommend setting a target partition size instead of a fixed number of shards:

 "partitionsSpec":{
        "type":"hashed",
        "targetPartitionSize":-1,
        "rowFlushBoundary":0,
        "numShards":100
      }

Then, to fix the OOM problem, you can decrease the target partition size and maybe also increase the container memory via the JVM options / MapReduce options, as in the sketch below.
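For example, something like this in the tuningConfig (illustrative numbers only; tune targetPartitionSize to your data volume):

 "partitionsSpec":{
        "type":"hashed",
        "targetPartitionSize":2500000
      }

and, if the reducers still run out of memory, give them more headroom in jobProperties, again just example values:

        "mapreduce.reduce.memory.mb":16384,
        "mapreduce.reduce.java.opts":"-server -Xmx12288m -Duser.timezone=UTC -Dfile.encoding=UTF-8"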

Hope that helps.