package eu.dnetlib.dhp.continuous_validator.utils; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.io.InputFile; import org.slf4j.LoggerFactory; public class ParquetUtils { private static final org.slf4j.Logger logger = LoggerFactory.getLogger(ParquetUtils.class); private static final Configuration parquetConfig = new Configuration(); public static List getParquetRecords(String fullFilePath) { InputFile inputFile; try { // TODO - Verify that this will create any directories which do not exist in the provided path. Currently // we create the directories beforehand. inputFile = HadoopInputFile.fromPath(new Path(fullFilePath), parquetConfig); // logger.trace("Created the parquet " + outputFile); // DEBUG! } catch (Throwable e) { // The simple "Exception" may not be thrown here, but an "Error" may be thrown. // "Throwable" catches EVERYTHING! logger.error("", e); return null; } List records = new ArrayList<>(); GenericRecord record; try (ParquetReader reader = AvroParquetReader. builder(inputFile).build()) { while ((record = reader.read()) != null) { records.add(record); } } catch (Throwable e) { // The simple "Exception" may not be thrown here, but an "Error" may be thrown. // "Throwable" catches EVERYTHING! logger.error("Problem when creating the \"ParquetWriter\" object or when writing the records with it!", e); // At some point, I got an "NoSuchMethodError", because of a problem in the AvroSchema file: // (java.lang.NoSuchMethodError: org.apache.avro.Schema.getLogicalType()Lorg/apache/avro/LogicalType;). // The error was with the schema: {"name": "date", "type" : ["null", {"type" : "long", "logicalType" : // "timestamp-millis"}]}, return null; } return records; // It may be empty. } public static Map getIdXmlMapFromParquetFile(String parquetFileFullPath) { List recordList = ParquetUtils.getParquetRecords(parquetFileFullPath); if (recordList == null) return null; // The error is already logged. else if (recordList.isEmpty()) { logger.error("The parquet-file \"" + parquetFileFullPath + "\" had no records inside!"); return null; } Map idXmlMap = new HashMap<>(); for (GenericRecord record : recordList) { if (logger.isTraceEnabled()) logger.trace(record.toString()); Object id = record.get("id"); if (id == null) continue; String idStr = id.toString(); Object encoding = record.get("encoding"); if (encoding == null) { logger.warn("Record with id = \"" + idStr + "\" does not provide the encoding for its body!"); continue; } String encodingStr = encoding.toString(); if (!encodingStr.equals("XML")) { logger.warn("Record with id = \"" + idStr + "\" does not have XML encoding for its body!"); continue; } Object body = record.get("body"); if (body == null) { logger.warn("Record with id = \"" + idStr + "\" does not have a body!"); continue; } String bodyStr = body.toString(); idXmlMap.put(idStr, bodyStr); // logger.debug(idStr + " | " + idXmlMap.get(idStr)); } return idXmlMap; } }