Druid for Noobs

·

3 min read

Ingesting Data via plain text

Lets create a data-source via Native Index:

curl --location 'http://localhost:8888/druid/indexer/v1/task' \
--header 'Content-Type: application/json' \
--data '{
  "type": "index",
  "spec": {
    "dataSchema": {
      "dataSource": "from_api_customer_data",
      "dimensionsSpec": {
        "dimensions": [
          "id"
        ]
      },
      "timestampSpec": {
        "column": "timestamp",
        "format": "auto"
      },
      "metricsSpec": [],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "NONE",
        "intervals": ["2023-01-01/2023-01-02"] 
      }
    },
    "ioConfig": {
      "type": "index",
      "inputSource": {
        "type": "inline",
        "data": "id,timestamp,name\n1,2016-06-27T00:00:11.080Z,smit\n2,2016-06-28T00:00:11.080Z,palaks\n3,2016-06-29T00:00:11.080Z,test
"
      },
      "inputFormat": {
        "type": "csv",
        "findColumnsFromHeader":true
      }
    },
    "tuningConfig": {
      "type": "index",
      "targetPartitionSize": 5000000,   
      "maxNumConcurrentSubTasks": 5
    }
  }
}
'

Lets now try to update the data-source data:

curl --location 'http://localhost:8888/druid/indexer/v1/task' \
--header 'Content-Type: application/json' \
--data '{
  "type": "index",
  "spec": {
    "dataSchema": {
      "dataSource": "from_api_customer_data",
      "dimensionsSpec": {
        "dimensions": [
          "id"
        ]
      },
      "timestampSpec": {
        "column": "timestamp",
        "format": "auto"
      },
      "metricsSpec": [],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "NONE",
        "intervals": ["2023-01-01/2023-01-02"] 
      }
    },
    "ioConfig": {
      "type": "index",
      "inputSource": {
        "type": "inline",
        "data": "id,timestamp,name\n1,2016-06-27T00:00:11.080Z,smits\n4,2016-06-27T00:00:11.080Z,preeeti\n5,2016-06-29T00:00:11.080Z,pss"
      },
      "inputFormat": {
        "type": "csv",
        "findColumnsFromHeader":true
      }
    },
    "tuningConfig": {
      "type": "index",
      "targetPartitionSize": 5000000,   
      "maxNumConcurrentSubTasks": 5
    }
  }
}
'

Query Data via Latest Timestamp

Now lets say:

id

timestamp

name

1

2016-06-27T00:00:11.080Z

smit

1

2016-06-28T00:00:11.080Z

smit2

3

2016-06-28T00:00:11.080Z

Test

And u only need to record for id and name where the timestamp is the latest. In above data, it should only return rows "smit2" and "Test".

There are 2 ways to query it:

Using Common Table Expression (CTE)

WITH max_time_cte AS (
  SELECT id, MAX(__time) AS maxTime
  FROM "inline_data"
  GROUP BY id
)
SELECT i.id, m.maxTime AS "maxTime", i.name
FROM "inline_data" i
JOIN max_time_cte m ON i.id = m.id AND i.__time = m.maxTime

Using Subquery

SELECT id, MAX(__time) AS "maxTime", name
FROM "inline_data"
WHERE (__time, id) IN (
  SELECT MAX(__time) AS maxTime, id
  FROM "inline_data"
  GROUP BY id
)
GROUP BY id, name

Now lets say you need data for selected Ids only. [Lets say you chuncking the data]

Lets add more data(see appendix if you need it in plain text):

id

timestamp

name

1

2023-08-05T12:00:00

John

2

2023-08-05T13:00:00

Alice

3

2023-08-05T14:00:00

Bob

1

2023-08-05T15:00:00

Susan

4

2023-08-05T16:00:00

Michael

3

2023-08-05T17:00:00

Linda

5

2023-08-05T18:00:00

David

6

2023-08-05T19:00:00

Emily

4

2023-08-05T20:00:00

William

7

2023-08-05T21:00:00

Sophia

8

2023-08-05T22:00:00

James

5

2023-08-05T23:00:00

Emma

9

2023-08-06T00:00:00

Oliver

10

2023-08-06T01:00:00

Mia

11

2023-08-06T02:00:00

Ethan

12

2023-08-06T03:00:00

Ava

7

2023-08-06T04:00:00

Matthew

13

2023-08-06T05:00:00

Sofia

11

2023-08-06T06:00:00

Noah

14

2023-08-06T07:00:00

Avery

15

2023-08-06T08:00:00

Liam

Using Subquery:

SELECT id, MAX(__time) AS "maxTime", name
FROM "inline_data"
WHERE id IN (1, 6, 4) -- Specify the list of IDs here
  AND (__time, id) IN (
    SELECT MAX(__time) AS maxTime, id
    FROM "inline_data"
    WHERE id IN (1, 6, 4) -- Specify the list of IDs here
    GROUP BY id
  )
GROUP BY id, name

Using Common Table Expression (CTE):

WITH max_time_cte AS (
  SELECT id, MAX(__time) AS maxTime
  FROM "inline_data"
  WHERE id IN (1, 6, 4) -- Specify the list of IDs here
  GROUP BY id
)
SELECT i.id, m.maxTime AS "maxTime", i.name
FROM "inline_data" i
JOIN max_time_cte m ON i.id = m.id AND i.__time = m.maxTime
WHERE i.id IN (1, 6, 4) -- Specify the list of IDs here

The above will result in:

id

maxTime

name

1

2023-08-05T15:00:00.000Z

Susan

6

2023-08-05T19:00:00.000Z

Emily

4

2023-08-05T20:00:00.000Z

William

Appendix:

id,timestamp,name
1,2023-08-05T12:00:00,John
2,2023-08-05T13:00:00,Alice
3,2023-08-05T14:00:00,Bob
1,2023-08-05T15:00:00,Susan
4,2023-08-05T16:00:00,Michael
3,2023-08-05T17:00:00,Linda
5,2023-08-05T18:00:00,David
6,2023-08-05T19:00:00,Emily
4,2023-08-05T20:00:00,William
7,2023-08-05T21:00:00,Sophia
8,2023-08-05T22:00:00,James
5,2023-08-05T23:00:00,Emma
9,2023-08-06T00:00:00,Oliver
10,2023-08-06T01:00:00,Mia
11,2023-08-06T02:00:00,Ethan
12,2023-08-06T03:00:00,Ava
7,2023-08-06T04:00:00,Matthew
13,2023-08-06T05:00:00,Sofia
11,2023-08-06T06:00:00,Noah
14,2023-08-06T07:00:00,Avery
15,2023-08-06T08:00:00,Liam