Batch ingestion using excessive amount of memory since upgrade to 0.10

Hi all,

I’ve been using 0.9 for quite a while and it was working well. I upgraded to 0.10 a couple of months ago and it has been mostly working OK, but I’ve seen a huge increase in the amount of memory the batch indexing job requires. It used to be that I could batch index a week’s worth of data using 6GB for my mappers and 16GB for my reducers. After switching to 0.10 I can only index 1 day’s worth of data with those settings (if I’m lucky). It fails in the reduce step with an OOM error. I’ve got days that still won’t index with 24GB allocated to the reducers. It doesn’t make any sense to me since the output shards are typically 500MB in size.

I am abusing the system a bit by including so many ids, and calculating hyperuniques, but our data volume is small and it was working well on 0.9. Is there something new that I need to configure?

Here is the job spec:

{

“type”:“index_hadoop”,

“spec”:{

“ioConfig”:{

“type”:“hadoop”,

“inputSpec”:{

“type”:“static”,

“inputFormat”:“io.druid.data.input.parquet.DruidParquetInputFormat”,

“paths”:"${path}"

}

},

“dataSchema”:{

“dataSource”:“sor_business_events_all”,

“parser”:{

“type”:“parquet”,

“parseSpec”:{

“format”:“timeAndDims”,

“timestampSpec”:{

“column”:“event_time”,

“format”:“auto”

},

“dimensionsSpec” : {

“dimensions” : [

“action_medium_type”,

“action_page_source”,

“action_reward_program_id”,

“action_reward_program_name”,

“app_id”,

“bby_sku”,

“bby_sku_category”,

“bby_sku_offer_name”,

“bby_store_id”,

“bby_sub_sku_category”,

“beacon_data”,

“brand_name”,

“chain_id”,

“chain_in_access_profile”,

“chain_name”,

“client_app_version”,

“client_event_time”,

“client_ip_address”,

“client_language”,

“client_locale”,

“client_os_name”,

“client_os_version”,

“client_request_time”,

“client_user_agent”,

“container_name”,

“decoded_battery”,

“decoded_btle_id”,

“decoded_hmac_valid”,

“decoded_nonce”,

“developer_profile_active”,

“device_brand”,

“device_carrier”,

“device_family”,

“device_id”,

“device_manufacturer”,

“device_model”,

“device_name”,

“device_platform_type”,

“discount_cents”,

“event_category_type”,

“event_city”,

“event_country”,

“event_dma”,

“event_id”,

“event_is_offline”,

“event_is_revenue_generating”,

“event_region”,

“event_status”,

“event_status_subcode”,

“event_subtype”,

“event_type”,

“external_transaction_amount_factor”,

“external_transaction_currency”,

“external_transaction_id”,

“external_transaction_type”,

“file_name”,

“fraud_check_current_invite_level”,

“fraud_check_current_scan_level”,

“fraud_check_current_walkin_level”,

“fraud_check_immediate_ban”,

“fraud_check_prev_invite_level”,

“fraud_check_prev_scan_level”,

“fraud_check_prev_walkin_level”,

“fraud_check_reason”,

“fraud_check_source”,

“fraud_suspected”,

“global_duplicate”,

“interpretation_source”,

“interpreted_at”,

“interpreted_purchase_timestamp”,

“interpreted_store_address_full”,

“interpreted_store_name”,

“interpreted_store_phone_number”,

“invalid_receipt”,

“list_entry_id”,

“list_entry_ingredient_id”,

“list_entry_keyword_id”,

“list_entry_modification_type”,

“list_entry_quantity_amount”,

“list_entry_quantity_amount_old”,

“list_entry_quantity_unit”,

“list_entry_quantity_unit_old”,

“list_entry_recipe_id”,

“list_entry_state”,

“list_entry_state_old”,

“list_entry_title”,

“list_entry_title_old”,

“list_id”,

“location_id”,

“location_is_partner”,

“location_name”,

“location_provider_id”,

“location_provider_name”,

“loyalty_program”,

“loyalty_program_bonus_awarded”,

“loyalty_program_num_previously_enrolled”,

“maturity_time”,

“measure”,

“merchant_category_list_id”,

“minimum_age_for_kicks”,

“missing_required_information”,

“no_award_found”,

“no_item_found”,

“not_receipt_image”,

“ocr_module_version”,

“ourcart_failure_code”,

“ourcart_failure_description”,

“ourcart_product_code”,

“ourcart_product_code_type”,

“ourcart_product_name”,

“ourcart_product_size_unit”,

“owner_id”,

“partner_id”,

“partner_name”,

“per_30_days_limit_reached”,

“per_day_limit_reached”,

“price_cents”,

“product_family_id”,

“product_family_name”,

“product_id”,

“product_name”,

“quantity”,

“receipt_image_partition”,

“receipt_item_rejection_reason”,

“receipt_item_was_user_specified”,

“receipt_processed_time”,

“receipt_purchase_time”,

“receipt_scan_id”,

“receipt_submission_time”,

“receipt_transcription_vendor”,

“return_item”,

“revenue_currency”,

“revenue_per_unit”,

“schema_version”,

“server_request_time”,

“session_is_badged”,

“session_start_time”,

“session_type”,

“settlement_status”,

“shown”,

“source_turnaround_time_ms”,

“store_city”,

“store_name”,

“store_region”,

“store_street_address”,

“subcategory_name”,

“survey_answers”,

“survey_http_referer”,

“survey_is_test”,

“survey_parent_question_id”,

“survey_question”,

“survey_question_id”,

“survey_question_type”,

“survey_response_id”,

“survey_start_time”,

“transaction_currency”,

“transmitter_department_id”,

“transmitter_location_id”,

“transmitter_messaging_threshold”,

“transmitter_supports_walkin”,

“transmitter_walkin_threshold”,

“user_age”,

“user_age_bracket”,

“user_city”,

“user_country”,

“user_credit_card_mastercard”,

“user_credit_card_type”,

“user_credit_card_visa”,

“user_device_id”,

“user_dma”,

“user_duplicate”,

“user_gender”,

“user_id”,

“user_is_fraudster”,

“user_is_orphaned”,

“user_is_registered”,

“user_is_revenue_generating”,

“user_is_verified”,

“user_kicks_balance”,

“user_platform_android”,

“user_platform_ios”,

“user_platform_type”,

“user_region”,

“user_registration_age_bracket”,

“user_registration_age_day”,

“user_registration_age_week”,

“user_registration_channel”,

“user_registration_invited”,

“user_registration_paid”,

“user_registration_source”,

“user_registration_time”,

“user_registration_type”,

“user_registration_version”,

“user_social_facebook”,

“user_social_twitter”,

“user_social_type”,

“user_specified_product_family_id”,

“user_today_is_badged”,

“usersave_target_id”,

“usersave_target_type”,

“video_id”,

“video_name”,

“video_units_purchased”,

“zone_chain_id”,

“zone_id”

],

“spatialDimensions” : [

{

“dimName” : “event_coordinates”,

“dims” : [“event_latitude”, “event_longitude”]

},

{

“dimName” : “location_coordinates”,

“dims” : [“location_latitude”, “location_longitude”]

},

{

“dimName” : “zone_coordinates”,

“dims” : [“zone_latitude”, “zone_longitude”]

}

]

}

}

},

“metricsSpec” : [

{

“type” : “count”,

“name” : “count”

},

{

“type” : “longSum”,

“name” : “transaction_kicks”,

“fieldName” : “transaction_kicks”

},

{

“type” : “longSum”,

“name” : “transaction_cost”,

“fieldName” : “transaction_cost”

},

{

“type” : “longSum”,

“name” : “transaction_revenue”,

“fieldName” : “transaction_revenue”

},

{

“type” : “thetaSketch”,

“name” : “user_id_sketch”,

“fieldName” : “user_id”,

“isInputThetaSketch”: false,

“size”: 16384

},

{

“type” : “hyperUnique”,

“name” : “unique_user_id”,

“fieldName” : “user_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_user_device_id”,

“fieldName” : “user_device_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_location_id”,

“fieldName” : “location_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_list_id”,

“fieldName” : “list_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_list_entry_ingredient_id”,

“fieldName” : “list_entry_ingredient_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_product_id”,

“fieldName” : “product_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_product_family_id”,

“fieldName” : “product_family_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_partner_id”,

“fieldName” : “partner_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_chain_id”,

“fieldName” : “chain_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_video_id”,

“fieldName” : “video_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_usersave_target_id”,

“fieldName” : “usersave_target_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_decoded_btle_id”,

“fieldName” : “decoded_btle_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_transmitter_location_id”,

“fieldName” : “transmitter_location_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_transmitter_department_id”,

“fieldName” : “transmitter_department_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_zone_id”,

“fieldName” : “zone_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_zone_chain_id”,

“fieldName” : “zone_chain_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_bby_store_id”,

“fieldName” : “bby_store_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_external_transaction_id”,

“fieldName” : “external_transaction_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_rolling_transaction_id”,

“fieldName” : “rolling_transaction_id”

},

{

“type” : “hyperUnique”,

“name” : “unique_ourcart_receipt_id”,

“fieldName” : “ourcart_receipt_id”

},

{

“type” : “longSum”,

“name” : “survey_response_duration”,

“fieldName” : “survey_response_duration”

},

{

“type” : “doubleSum”,

“name” : “ocr_brightness_score”,

“fieldName” : “ocr_brightness_score”

},

{

“type” : “doubleSum”,

“name” : “ocr_contrast_score”,

“fieldName” : “ocr_contrast_score”

},

{

“type” : “doubleSum”,

“name” : “ocr_element_score”,

“fieldName” : “ocr_element_score”

},

{

“type” : “doubleSum”,

“name” : “ocr_structural_score”,

“fieldName” : “ocr_structural_score”

},

{

“type” : “doubleSum”,

“name” : “sum_quantity”,

“fieldName” : “quantity”

},

{

“type” : “longSum”,

“name” : “sum_price_cents”,

“fieldName” : “price_cents”

},

{

“type” : “longSum”,

“name” : “sum_discount_cents”,

“fieldName” : “discount_cents”

},

{

“type” : “longSum”,

“name” : “shopkick_internal_points”,

“fieldName” : “shopkick_internal_points”

},

{

“type” : “longSum”,

“name” : “total_transaction_commission_amount”,

“fieldName” : “total_transaction_commission_amount”

},

{

“type” : “longSum”,

“name” : “sku_extended_price”,

“fieldName” : “sku_extended_price”

},

{

“type” : “longSum”,

“name” : “total_commission_amount”,

“fieldName” : “total_commission_amount”

},

{

“type” : “longSum”,

“name” : “single_commission_amount”,

“fieldName” : “single_commission_amount”

},

{

“type” : “longSum”,

“name” : “commission_flat_fee”,

“fieldName” : “commission_flat_fee”

},

{

“type” : “longSum”,

“name” : “commission_rate”,

“fieldName” : “commission_rate”

},

{

“type” : “longSum”,

“name” : “total_quantity”,

“fieldName” : “total_quantity”

},

{

“type” : “longSum”,

“name” : “total_external_transaction_amount”,

“fieldName” : “total_external_transaction_amount”

},

{

“type” : “longSum”,

“name” : “escrow_kicks”,

“fieldName” : “escrow_kicks”

}

],

“granularitySpec”:{

“type”:“uniform”,

“segmentGranularity”:“day”,

“queryGranularity”:“none”,

“intervals”:[

“2016-11-24/2016-11-25”

]

}

},

“tuningConfig”:{

“type”:“hadoop”,

“partitionsSpec”:{

“targetPartitionSize”:1000000

},

“buildV9Directly”:true,

“jobProperties”:{

“mapreduce.job.user.classpath.first”:true,

“mapreduce.tasktracker.map.tasks.maximum”:4,

“mapreduce.tasktracker.reduce.tasks.maximum”:2,

“mapreduce.map.memory.mb”:4096,

“mapreduce.reduce.memory.mb”:16384,

“mapred.job.queue.name”:“druid”

},

“forceExtendableShardSpecs”: true

}

}

}

Thanks,

In 0.9 were you using buildV9Directly? Which specific versions of 0.9 and 0.10 were/are you using? Also, what kind of OOME are you getting? (Java heap, GC overhead, direct memory, native thread, etc)

Hi Gian,

In 0.9 we were building v9 directly. We were using 0.9.1 and are now using 0.10.0. We are getting GC overhead errors like this one below:

2017-08-25T18:41:25,459 INFO [main] io.druid.java.util.common.io.smoosh.FileSmoosher - Created smoosh file [/yarn/nm/usercache/hdfs/appcache/application_1502845246585_1002/container_e22_1502845246585_1002_01_000004/tmp/base3085817555340554441flush/final/00000.smoosh] of size [13259878] bytes.
2017-08-25T18:41:25,561 WARN [main] org.apache.hadoop.mapred.YarnChild - Exception running child : java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at com.google.common.base.Throwables.propagate(Throwables.java:160)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:779)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:478)
	at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
	at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627)
	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
	at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1693)
	at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:299)
	at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:272)
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:96)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:692)
	... 9 more
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.nio.HeapByteBuffer.asReadOnlyBuffer(HeapByteBuffer.java:117)
	at io.druid.hll.HyperLogLogCollector.toByteBuffer(HyperLogLogCollector.java:487)
	at io.druid.query.aggregation.hyperloglog.HyperUniquesSerde$3.toBytes(HyperUniquesSerde.java:134)
	at io.druid.query.aggregation.hyperloglog.HyperUniquesSerde$3.toBytes(HyperUniquesSerde.java:113)
	at io.druid.segment.data.GenericIndexedWriter.write(GenericIndexedWriter.java:151)
	at io.druid.segment.serde.LargeColumnSupportedComplexColumnSerializer.serialize(LargeColumnSupportedComplexColumnSerializer.java:92)
	at io.druid.segment.IndexMergerV9.mergeIndexesAndWriteColumns(IndexMergerV9.java:460)
	at io.druid.segment.IndexMergerV9.makeIndexFiles(IndexMergerV9.java:207)
	at io.druid.segment.IndexMerger.merge(IndexMerger.java:437)
	at io.druid.segment.IndexMerger.persist(IndexMerger.java:185)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.persist(IndexGeneratorJob.java:506)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.access$200(IndexGeneratorJob.java:478)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer$3.doRun(IndexGeneratorJob.java:644)
	at io.druid.common.guava.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:42)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at com.google.common.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:297)
	at java.util.concurrent.AbstractExecutorService.submit(AbstractExecutorService.java:112)
	at com.google.common.util.concurrent.AbstractListeningExecutorService.submit(AbstractListeningExecutorService.java:50)
	at io.druid.indexer.IndexGeneratorJob$IndexGeneratorReducer.reduce(IndexGeneratorJob.java:637)
	... 9 more

Hey Ben,

Would you be able to run this job with the -XX:+HeapDumpOnOutOfMemoryError option, and analyze the heap dump to see what kinds of objects are most prevalent? Then share that with the list, and if something looks suspicious, that might lead to a solution.