Can we reindex with a Hadoop job?

Hi guys,
I want to reindex the datasource 'myoldds' into a new datasource 'mynewds', and I tried the task below.
It is running, but it seems to run locally and is very slow.
Can we reindex existing segments on Hadoop? How should the task JSON be written? I couldn't find a full example anywhere.
Thanks in advance :slight_smile:

{
  "type": "index",
  "spec": {
    "dataSchema": {
      "dataSource": "mynewds",
      "parser": {
        "type": "string",
        "parseSpec": {
          "format": "json",
          "timestampSpec": {
            "column": "ts",
            "format": "millis"
          },
          "dimensionsSpec": {
            "dimensions": [
              "visitKey",
              "visitKeyDesc",
              "imei_dailyReportOldUser",
              "intDay"
            ],
            "spatialDimensions": []
          }
        }
      },
      "metricsSpec": [{
        "type": "count",
        "name": "cnt"
      }, {
        "type": "thetaSketch",
        "name": "imei_sessionId_sketch_x256",
        "fieldName": "imei_sessionId_Dup",
        "size": 4194304
      }, {
        "type": "thetaSketch",
        "name": "imei_sketch_x256",
        "fieldName": "imei_Dup",
        "size": 4194304
      }],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "DAY",
        "intervals": [
          "2016-11-05T00:00:00.000+08:00/2016-11-06T00:00:00.000+08:00"
        ]
      }
    },
    "ioConfig": {
      "type": "index",
      "firehose": {
        "type": "ingestSegment",
        "dataSource": "myoldds",
        "interval": "2016-11-05T00:00:00.000+08:00/2016-11-06T00:00:00.000+08:00"
      }
    },
    "tuningConfig": {
      "type": "index",
      "ignoreInvalidRows": true,
      "combineText": true,
      "persistInHeap": false,
      "indexSpec": {
        "bitmap": {
          "type": "concise"
        }
      },
      "rowFlushBoundary": 40000,
      "buildV9Directly": true,
      "partitionsSpec": {
        "type": "hashed",
        "targetPartitionSize": 1000000
      },
      "jobProperties": {
        "mapreduce.job.queuename": "default",
        "mapreduce.map.speculative": "true",
        "mapreduce.reduce.speculative": "false",
        "mapreduce.job.running.map.limit": "128",
        "mapreduce.job.running.reduce.limit": "128",
        "mapreduce.job.user.classpath.first": "false",
        "mapreduce.job.reduce.slowstart.completedmaps": "1.0",
        "mapreduce.reduce.shuffle.input.buffer.percent": "0.3",
        "mapreduce.input.fileinputformat.split.minsize": "134217728",
        "mapreduce.input.fileinputformat.split.maxsize": "134217728",
        "mapreduce.map.memory.mb": "3000",
        "mapreduce.task.io.sort.mb": "256",
        "mapreduce.task.io.sort.factor": "100",
        "mapreduce.map.output.compress": "true",
        "yarn.app.mapreduce.am.command-opts": "-Xmx2048m -XX:ErrorFile=/tmp/hs_err_task_@taskid@_pid%p.log",
        "mapreduce.map.java.opts": "-XX:+UseParallelOldGC -XX:ParallelGCThreads=4 -XX:-OmitStackTraceInFastThrow -server -Xmn400m -Xms2200m -Xmx2200m -Duser.timezone=Asia/Shanghai -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:PermSize=256m -XX:MaxPermSize=256m -XX:-UseGCOverheadLimit -XX:ErrorFile=/tmp/hs_err_task_@taskid@_pid%p.log",
        "mapreduce.reduce.memory.mb": "6000",
        "mapreduce.reduce.java.opts": "-XX:-OmitStackTraceInFastThrow -server -XX:+UseG1GC -Xmx4500m -Xms4500m -Duser.timezone=Asia/Shanghai -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:-UseGCOverheadLimit -XX:ErrorFile=/tmp/hs_err_task_@taskid@_pid%p.log"
      }
    }
  },
  "context": {
    "druid.indexer.runner.javaOpts": "-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:MaxPermSize=256m -XX:PermSize=256m -Xms1024m -Xmx1024m"
  },
  "fields": {
    "dimensions": [
      "visitKey",
      "visitKeyDesc",
      "imei_dailyReportOldUser",
      "intDay"
    ],
    "metrics": [
      "cnt",
      "imei_sketch_x256",
      "imei_sessionId_sketch_x256"
    ]
  }
}
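(It shows up on the Overlord as a plain "index" task. A minimal sketch of fetching the payload back to check, assuming the Overlord runs on localhost:8090 and with a placeholder task id:)

import requests

OVERLORD = "http://localhost:8090"  # assumption: your Overlord host/port
task_id = "YOUR_TASK_ID"            # hypothetical placeholder

# Fetch the submitted task's payload back from the Overlord.
resp = requests.get(f"{OVERLORD}/druid/indexer/v1/task/{task_id}")
resp.raise_for_status()
print(resp.json()["payload"]["type"])  # prints "index", i.e. not a Hadoop task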

Problem solved. Someone gave me an example; I'm pasting it here for anyone else who needs it (with a quick note after the spec on submitting it). The key differences from my attempt above: the task type must be "index_hadoop" (a plain "index" task always runs locally inside a single peon, which is why it was so slow), and the ioConfig uses a "dataSource" inputSpec instead of the "ingestSegment" firehose. Replace the uppercase placeholders with your own values, and fill in metricsSpec with the same aggregators the original datasource was ingested with.

{
  "type": "index_hadoop",
  "spec": {
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "type": "dataSource",
        "ingestionSpec": {
          "dataSource": "DATA_SOURCE",
          "intervals": ["INTERVALS"]
        }
      }
    },
    "dataSchema": {
      "dataSource": "DATA_SOURCE",
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "SEGMENT_GRANULARITY",
        "queryGranularity": "QUERY_GRANULARITY",
        "intervals": ["INTERVALS"]
      },
      "parser": {
        "type": "map",
        "parseSpec": {
          "format": "timeAndDims",
          "timestampSpec": {
            "column": "timestamp",
            "format": "auto"
          },
          "dimensionsSpec": {}
        }
      },
      "metricsSpec": []
    },
    "tuningConfig": {
      "type": "hadoop",
      "partitionsSpec": {
        "targetPartitionSize": 5000000
      },
      "jobProperties": {
        "mapreduce.job.user.classpath.first": "true"
      },
      "useCombiner": true
    }
  }
}
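To run it, POST the filled-in spec to the Overlord's task endpoint. A minimal sketch, assuming the Overlord listens on localhost:8090 and the spec above is saved as reindex_task.json (both placeholders for your setup):

import json
import requests

OVERLORD = "http://localhost:8090"  # assumption: your Overlord host/port

# Load the filled-in reindex spec (hypothetical filename).
with open("reindex_task.json") as f:
    task = json.load(f)

# Submit it; the Overlord replies with the new task's id.
resp = requests.post(f"{OVERLORD}/druid/indexer/v1/task", json=task)
resp.raise_for_status()
print(resp.json())  # e.g. {"task": "index_hadoop_mynewds_..."}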

On Wednesday, December 14, 2016 at 12:45:29 PM UTC+8, 李斯宁 wrote:

nice
