Hey,
Does batch delta ingestion duplicate data? We have a datasource in which the segments are correctly ingested, i.e. there is only one entry per event. We have a use case in which data for previous days also arrives along with the current day's data, so we run a daily batch delta ingestion for those existing segments as well as ingesting a new segment for the current date. But for some reason duplicate events are showing up in the system.
When I run batch delta ingestion for the segment range 2016-10-10T00:00:00.000Z/2016-10-13T00:00:00.000Z, duplicate rows are created in the segments for this interval.
Here is my ingestion spec:
{
  "type": "index_hadoop",
  "spec": {
    "dataSchema": {
      "dataSource": "prism-data-10",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "dimensionsSpec": {
            "dimensions": [
              "event_id",
              "lang",
              "share_clicks",
              "ts_bucket",
              "old_hash_id",
              "ab_test",
              "event_name",
              "title",
              "noti_opened",
              "fullstory_time_total",
              "ts_back_valid",
              "custom_title",
              "targeted_city",
              "at",
              "short_view_event",
              "published_dt",
              "short_time",
              "notification_type",
              "variants",
              "device_id",
              "category",
              "toss_opened",
              "noti_shown",
              "event_source",
              "score",
              "author",
              "bookmark",
              "is_video",
              "source",
              "like_count",
              "share_view",
              "vid_length",
              "content",
              "fullstory_view",
              "ts_valid",
              "targeted_country",
              "video_event",
              "shortened_url",
              "toss_clicked",
              "hashId",
              "group_id",
              "img_url",
              "is_deleted"
            ]
          },
          "timestampSpec": {
            "format": "millis",
            "column": "at"
          }
        }
      },
      "metricsSpec": [{
        "type": "count",
        "name": "count"
      }, {
        "type": "doubleSum",
        "name": "fullstory_total_time",
        "fieldName": "fullstory_time_total"
      }, {
        "type": "longSum",
        "name": "total_like_count",
        "fieldName": "like_count"
      }, {
        "type": "longMax",
        "name": "total_share_views",
        "fieldName": "share_views"
      }, {
        "type": "longMax",
        "name": "total_vid_length",
        "fieldName": "vid_length"
      }, {
        "type": "doubleSum",
        "name": "total_short_time",
        "fieldName": "short_time"
      }, {
        "type": "hyperUnique",
        "name": "distinct_user",
        "fieldName": "device_id"
      }, {
        "type": "hyperUnique",
        "name": "distinct_event",
        "fieldName": "event_id"
      }, {
        "type": "hyperUnique",
        "name": "distinct_hash_Id",
        "fieldName": "hashId"
      }, {
        "type": "longSum",
        "name": "total_bookmark",
        "fieldName": "bookmark"
      }, {
        "type": "longSum",
        "name": "total_fullstory_view",
        "fieldName": "fullstory_view"
      }, {
        "type": "longSum",
        "name": "total_noti_opened",
        "fieldName": "noti_opened"
      }, {
        "type": "longSum",
        "name": "total_noti_shown",
        "fieldName": "noti_shown"
      }, {
        "type": "longSum",
        "name": "total_toss_clicked",
        "fieldName": "toss_clicked"
      }, {
        "type": "longSum",
        "name": "total_toss_opened",
        "fieldName": "toss_opened"
      }, {
        "type": "longSum",
        "name": "total_share_click",
        "fieldName": "share_clicks"
      }, {
        "type": "longSum",
        "name": "total_short_views",
        "fieldName": "short_view_event"
      }, {
        "type": "longSum",
        "name": "total_video_views",
        "fieldName": "video_event"
      }, {
        "type": "longSum",
        "name": "total_ts_valid",
        "fieldName": "ts_valid"
      }, {
        "type": "longSum",
        "name": "total_full_ts_valid",
        "fieldName": "ts_back_valid"
      }, {
        "type": "longMax",
        "name": "is_ab",
        "fieldName": "ab_test"
      }, {
        "type": "longMax",
        "name": "ab_variants",
        "fieldName": "variants"
      }],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": {
          "type": "none"
        },
        "intervals": [
          "2016-10-10T00:00:00.000Z/2016-10-16T00:00:00.000Z"
        ]
      }
    },
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "type": "multi",
        "children": [{
          "type": "dataSource",
          "ingestionSpec": {
            "dataSource": "prism-data-10",
            "intervals": ["2016-10-10T00:00:00.000Z/2016-10-13T00:00:00.000Z"]
          }
        }, {
          "type": "static",
          "paths": "gs://nis-prism/new/2016/10/13//part-,gs://nis-prism/new/2016/10/14//part-"
        }]
      }
    },
    "tuningConfig": {
      "type": "hadoop",
      "partitionsSpec": {
        "type": "hashed",
        "targetPartitionSize": 3000000
      },
      "numBackgroundPersistThreads": 1,
      "overwriteFiles": true,
      "ignoreInvalidRows": true
    }
  },
  "hadoopDependencyCoordinates": ["org.apache.hadoop:hadoop-client:2.7.2"]
}