UrlsController/src/main/resources/schemas/payload.avsc

17 lines
744 B
JSON

{
  "type": "record",
  "namespace": "UrlsController",
  "name": "Payload",
  "fields": [
    {"name": "id", "type": "string"},
    {"name": "original_url", "type": "string"},
    {
      "name": "actual_url",
      "type": "string",
      "doc": "This should NOT be null, since only the \"found\" pdf-publications are processed in parquet."
    },
    {"name": "date", "type": {"type": "long", "logicalType": "timestamp-millis"}},
    {"name": "mimetype", "type": "string"},
    {"name": "size", "type": ["null", "string"]},
    {"name": "hash", "type": "string"},
    {
      "name": "location",
      "type": "string",
      "doc": "This is not null; a check is added before processing any record."
    },
    {"name": "provenance", "type": "string"}
  ]
}