Batch ingestion of a file loops until failure

Hi,
I have created a small 3-node cluster with HDFS 2.3 on the third node. I was able to ingest this file a few days ago, but now I cannot. The job keeps looping over this sharding process until it eventually fails. Any ideas why it takes a few hours for this to fail?

        },
          "shardNum" : 18604
        } ],
        "2013-12-09T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18605
        } ],
        "2013-12-10T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18606
        } ],
        "2013-12-11T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18607
        } ],
        "2013-12-12T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18608
        } ],
        "2013-12-13T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18609
        } ],
        "2013-12-14T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18610
        } ],
        "2013-12-15T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18611
        } ],
        "2013-12-16T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18612
        } ],
        "2013-12-17T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18613
        } ],
        "2013-12-18T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18614
        } ],
        "2013-12-19T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18615
        } ],
        "2013-12-20T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18616
        } ],
        "2013-12-21T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18617
        } ],
        "2013-12-22T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18618
        } ],
        "2013-12-23T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18619
        } ],
        "2013-12-24T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18620
        } ],
        "2013-12-25T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18621
        } ],
        "2013-12-26T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18622
        } ],
        "2013-12-27T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18623
        } ],
        "2013-12-28T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18624
        } ],
        "2013-12-29T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18625
        } ],
        "2013-12-30T00:00:00.000Z" : [ {
          "actualSpec" : {
            "type" : "none"
          },
          "shardNum" : 18626
        } ]
      },
      "indexSpec" : {
        "bitmap" : {
          "type" : "concise"
        },
        "dimensionCompression" : null,
        "metricCompression" : null
      },
      "maxRowsInMemory" : 75000,
      "leaveIntermediate" : false,
      "cleanupOnFailure" : true,
      "overwriteFiles" : false,
      "ignoreInvalidRows" : false,
      "jobProperties" : { },
      "combineText" : false,
      "useCombiner" : false,
      "buildV9Directly" : false,
      "numBackgroundPersistThreads" : 0
    },
    "uniqueId" : "f5a08119aa4e4e448b66c967a418d56d"
  }
}
2016-09-15T15:24:48,804 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Task - Task:attempt_local485075656_0001_r_001638_0 is done. And is in the process of committing
2016-09-15T15:24:48,805 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - 1 / 1 copied.
2016-09-15T15:24:48,805 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Task - Task attempt_local485075656_0001_r_001638_0 is allowed to commit now
2016-09-15T15:24:48,805 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter - Saved output of task 'attempt_local485075656_0001_r_001638_0' to file:/opt/druid-0.9.1.1/var/druid/hadoop-tmp/usgs_hadoop/2016-09-15T151347.171Z/f5a08119aa4e4e448b66c967a418d56d/_temporary/0/task_local485075656_0001_r_001638
2016-09-15T15:24:48,806 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - reduce > reduce
2016-09-15T15:24:48,806 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Task - Task 'attempt_local485075656_0001_r_001638_0' done.
2016-09-15T15:24:48,806 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - Finishing task: attempt_local485075656_0001_r_001638_0
2016-09-15T15:24:48,806 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - Starting task: attempt_local485075656_0001_r_001639_0
2016-09-15T15:24:48,807 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Task -  Using ResourceCalculatorProcessTree : [ ]
2016-09-15T15:24:48,807 INFO [pool-23-thread-1] org.apache.hadoop.mapred.ReduceTask - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@71b07556
2016-09-15T15:24:48,807 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - MergerManager: memoryLimit=1408552064, maxSingleShuffleLimit=352138016, mergeThreshold=929644416, ioSortFactor=10, memToMemMergeOutputsThreshold=10
2016-09-15T15:24:48,808 INFO [EventFetcher for fetching Map Completion Events] org.apache.hadoop.mapreduce.task.reduce.EventFetcher - attempt_local485075656_0001_r_001639_0 Thread started: EventFetcher for fetching Map Completion Events
2016-09-15T15:24:48,810 INFO [localfetcher#1640] org.apache.hadoop.mapreduce.task.reduce.LocalFetcher - localfetcher#1640 about to shuffle output of map attempt_local485075656_0001_m_000000_0 decomp: 2 len: 6 to MEMORY
2016-09-15T15:24:48,810 INFO [localfetcher#1640] org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput - Read 2 bytes from map-output for attempt_local485075656_0001_m_000000_0
2016-09-15T15:24:48,810 INFO [localfetcher#1640] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - closeInMemoryFile -> map-output of size: 2, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->2
2016-09-15T15:24:48,811 INFO [EventFetcher for fetching Map Completion Events] org.apache.hadoop.mapreduce.task.reduce.EventFetcher - EventFetcher is interrupted.. Returning
2016-09-15T15:24:48,811 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - 1 / 1 copied.
2016-09-15T15:24:48,811 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - finalMerge called with 1 in-memory map-outputs and 0 on-disk map-outputs
2016-09-15T15:24:48,812 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2016-09-15T15:24:48,812 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 0 segments left of total size: 0 bytes
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - Merged 1 segments, 2 bytes to disk to satisfy reduce memory limit
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - Merging 1 files, 6 bytes from disk
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl - Merging 0 segments, 0 bytes from memory into reduce
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Merger - Merging 1 sorted segments
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapred.Merger - Down to the last merge-pass, with 0 segments left of total size: 0 bytes
2016-09-15T15:24:48,813 INFO [pool-23-thread-1] org.apache.hadoop.mapred.LocalJobRunner - 1 / 1 copied.


This is my ingestion spec:

{
  "type": "index_hadoop",
  "spec": {
    "dataSchema": {
      "dataSource": "usgs_hadoop",
      "parser": {
        "type": "hadoopyString",
        "parseSpec": {
          "format": "tsv",
          "columns" : ["staid","val","dates"],
          "timestampSpec": {
            "column": "dates"
          },
          "dimensionsSpec": {
            "dimensions": [ "staid", "val" ]
          }
        }
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "NONE",
        "intervals": [
          "1963-01-01/2013-12-31"
        ]
      }
    },
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "type": "static",
        "paths": "hdfs://druidsand:9000/tmp/napa-flow.tsv"
      }
    },
    "tuningConfig": {

      "type": "hadoop"
    }
  },
  "hadoopDependencyCoordinates":  ["org.apache.hadoop:hadoop-client:2.3.0"]
}

This is the file I am ingesting:


"11458000"      90      1963-01-01
"11458000"      87      1963-01-02
"11458000"      85      1963-01-03
"11458000"      80      1963-01-04
"11458000"      76      1963-01-05
"11458000"      75      1963-01-06
"11458000"      73      1963-01-07
"11458000"      71      1963-01-08
"11458000"      65      1963-01-09
"11458000"      59      1963-01-10

Can you paste the actual failure? Also, the interval of your data seems massive: with segmentGranularity set to DAY and an interval of 1963-01-01/2013-12-31, the indexer has to build one shard per day, roughly 18,600 of them, which matches the shard numbers in your log and would explain why the job grinds for hours before failing. Are you sure the interval in the ingestion spec actually matches the interval of your data?
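
If the file really does span 1963 through 2013, one option is to coarsen the segment granularity so far fewer shards are created. A minimal sketch of the changed granularitySpec, assuming your Druid version accepts YEAR here (MONTH is another option):

      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "YEAR",
        "queryGranularity": "NONE",
        "intervals": [
          "1963-01-01/2013-12-31"
        ]
      }

If the data actually covers a shorter range than 1963-2013, tightening "intervals" to the real min/max dates of the file would also keep the shard count down without changing the segment granularity.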