Hi,
I'm hitting a strange problem on my Druid 0.15 cluster.
I'm ingesting Parquet files, and that works well.
For my tests, the file only contains 2 rows.
After ingestion, I can see my segment with all my dimensions in the segment view of the UI.
But when I try to query my datasource, I get errors.
For example, when I try to retrieve a particular field named "create_time":
```
select create_time from glh_enriched_sale
```

gives me the following error:

```
Unknown exception / org.apache.calcite.runtime.CalciteContextException: From line 1, column 23 to line 1, column 33: Column 'create_time' not found in any table / org.apache.calcite.tools.ValidationException
```
But I can see it listed in the segment view of the UI:

```
Segment ID
glh_enriched_sale_2018-04-22T00:00:00.000Z_2018-04-23T00:00:00.000Z_2019-08-07T15:42:42.307Z

Dimensions (78)
id, create_time, transaction_time, transaction_type_code, transaction_type_id, update_date

Metrics (7)
amt_discount, qty
```
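To double-check what the SQL layer actually exposes for this datasource, I suppose I could ask the Broker's SQL endpoint for the table's columns via INFORMATION_SCHEMA. A rough sketch (the Broker host/port below is just an assumption for my setup):

```python
# Rough sketch: list the columns the Druid SQL layer sees for the datasource.
# The Broker URL is an assumption for my setup; adjust host/port as needed.
import requests

BROKER_SQL_URL = "http://localhost:8082/druid/v2/sql/"

sql = """
SELECT COLUMN_NAME, DATA_TYPE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = 'druid' AND TABLE_NAME = 'glh_enriched_sale'
"""

resp = requests.post(BROKER_SQL_URL, json={"query": sql})
resp.raise_for_status()
for row in resp.json():
    print(row["COLUMN_NAME"], row["DATA_TYPE"])
```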
I wondered if that could come from my source file, so I checked it.
I opened it with Python, and here is what it shows:
```
   business_date             create_time  transaction_time          qty  transaction_type_code  update_date
0  2018-04-22T00:00:00.000Z  None         2018-04-22T15:40:03.000Z    1                     11  2019-06-08T23:49:42.000Z
1  2018-04-22T00:00:00.000Z  None         2018-04-22T15:40:03.000Z    1                     11  2019-06-08T23:49:42.000Z
```
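For reference, this is roughly how I read the file. A minimal sketch, assuming pandas with pyarrow installed; the file name is a placeholder for the actual Parquet file:

```python
# Minimal sketch of the check above, assuming pandas + pyarrow are installed.
# The file name is a placeholder for the actual Parquet file pulled from S3.
import pandas as pd

df = pd.read_parquet("enriched_sale_sample.parquet")
print(df[["business_date", "create_time", "transaction_time",
          "qty", "transaction_type_code", "update_date"]])
```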
And here is my ingestion spec:
```
{
  "type": "index_hadoop",
  "spec": {
    "dataSchema": {
      "dataSource": "qa_enriched_sale",
      "parser": {
        "type": "parquet",
        "parseSpec": {
          "format": "timeAndDims",
          "dimensionsSpec": {
            "dimensions": [
              "id",
              "create_time",
              "transaction_time",
              "transaction_type_code",
              "transaction_type_id",
              "update_date"
            ]
          },
          "timestampSpec": {
            "column": "business_date",
            "format": "iso"
          }
        }
      },
      "metricsSpec": [
        {
          "type": "doubleSum",
          "name": "amt_discount",
          "fieldName": "amt_discount"
        },
        {
          "type": "longSum",
          "name": "qty",
          "fieldName": "qty"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "DAY",
        "rollup": true,
        "intervals": [
          "2018-04-21T00:00:00.000Z/2018-04-23T00:00:00.000Z"
        ]
      },
      "transformSpec": {
        "filter": null,
        "transforms": []
      }
    },
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "paths": "s3a://bucketfolder/",
        "inputFormat": "org.apache.druid.data.input.parquet.DruidParquetInputFormat",
        "type": "static"
      },
      "metadataUpdateSpec": null,
      "segmentOutputPath": null
    },
    "tuningConfig": {
      "type": "hadoop",
      "workingPath": null,
      "version": "2019-05-10T13:31:45.290Z",
      "partitionsSpec": {
        "type": "hashed",
        "targetPartitionSize": -1,
        "maxPartitionSize": -1,
        "assumeGrouped": false,
        "numShards": 1,
        "partitionDimensions": [
          "business_date"
        ]
      },
      "shardSpecs": {},
      "indexSpec": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "metricCompression": "lz4",
        "longEncoding": "longs"
      },
      "maxRowsInMemory": 75000,
      "maxBytesInMemory": 0,
      "leaveIntermediate": false,
      "cleanupOnFailure": true,
      "overwriteFiles": false,
      "ignoreInvalidRows": true,
      "jobProperties": {
        "mapreduce.map.output.compress": "true",
        "mapreduce.map.java.opts": "-server -Xmx750m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC",
        "mapreduce.input.fileinputformat.split.maxsize": "10737418250",
        "mapreduce.reduce.memory.mb": "6000",
        "mapreduce.fileoutputcommitter.algorithm.version": "2",
        "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
        "fs.s3a.server-side-encryption-algorithm": "AES256",
        "fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "mapreduce.map.memory.mb": "3000",
        "mapreduce.input.fileinputformat.split.minsize": "5368709120",
        "mapreduce.task.timeout": "1800000",
        "mapreduce.reduce.java.opts": "-server -Xmx1500m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC",
        "io.compression.codecs": "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec",
        "mapreduce.job.user.classpath.first": "true"
      },
      "combineText": false,
      "useCombiner": true,
      "buildV9Directly": true,
      "numBackgroundPersistThreads": 1,
      "forceExtendableShardSpecs": false,
      "useExplicitVersion": false,
      "allowedHadoopPrefix": [],
      "logParseExceptions": false,
      "maxParseExceptions": 2147483647
    }
  },
  "hadoopDependencyCoordinates": null,
  "classpathPrefix": null
}
```
Is it normal that the column is not available for querying?
I was expecting to get null or empty values, not errors.
How can I fix this?
Thanks
Guillaume