drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [drill] amansinha100 commented on a change in pull request #1723: DRILL-7063: Separate metadata cache file into summary, file metadata
Date Thu, 04 Apr 2019 18:45:24 GMT
amansinha100 commented on a change in pull request #1723: DRILL-7063: Separate metadata cache
file into summary, file metadata
URL: https://github.com/apache/drill/pull/1723#discussion_r272308859
 
 

 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
 ##########
 @@ -230,68 +254,102 @@ private static boolean ignoreReadingMetadata(MetadataContext metaContext,
Path p
    *
    * @param path to the directory of the parquet table
    * @param fs file system
-   * @param allColumns if set, store column metadata for all the columns
+   * @param allColumnsInteresting if set, store column metadata for all the columns
    * @param columnSet Set of columns for which column metadata has to be stored
    * @return Pair of parquet metadata. The left one is a parquet metadata for the table.
The right one of the Pair is
   *         a metadata for all subdirectories (if they are present and there are no
parquet files in the
    *         {@code path} directory).
    * @throws IOException if parquet metadata can't be serialized and written to the json
file
    */
-  private Pair<ParquetTableMetadata_v3, ParquetTableMetadataDirs>
-  createMetaFilesRecursively(final Path path, FileSystem fs, boolean allColumns, Set<String>
columnSet)
-    throws IOException {
+  private Pair<ParquetTableMetadata_v4, ParquetTableMetadataDirs> createMetaFilesRecursively(final
Path path, FileSystem fs, boolean allColumnsInteresting, Set<String> columnSet) throws
IOException {
     Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
-    List<ParquetFileMetadata_v3> metaDataList = Lists.newArrayList();
+    List<ParquetFileMetadata_v4> metaDataList = Lists.newArrayList();
     List<Path> directoryList = Lists.newArrayList();
-    ConcurrentHashMap<ColumnTypeMetadata_v3.Key, ColumnTypeMetadata_v3> columnTypeInfoSet
=
+    ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> columnTypeInfoSet
=
         new ConcurrentHashMap<>();
     FileStatus fileStatus = fs.getFileStatus(path);
+    long dirTotalRowCount = 0;
     assert fileStatus.isDirectory() : "Expected directory";
 
     final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();
 
     for (final FileStatus file : DrillFileSystemUtil.listAll(fs, path, false)) {
       if (file.isDirectory()) {
-        ParquetTableMetadata_v3 subTableMetadata = (createMetaFilesRecursively(file.getPath(),
fs, allColumns,
-          columnSet)).getLeft();
-        metaDataList.addAll(subTableMetadata.files);
-        directoryList.addAll(subTableMetadata.directories);
+        ParquetTableMetadata_v4 subTableMetadata = (createMetaFilesRecursively(file.getPath(),
fs, allColumnsInteresting, columnSet)).getLeft();
+        ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> subTableColumnTypeInfo
= subTableMetadata.getColumnTypeInfoMap();
+        metaDataList.addAll((List<ParquetFileMetadata_v4>) subTableMetadata.getFiles());
+        directoryList.addAll(subTableMetadata.getDirectories());
         directoryList.add(file.getPath());
         // Merge the schema from the child level into the current level
         //TODO: We need a merge method that merges two columns with the same name but different
types
-        columnTypeInfoSet.putAll(subTableMetadata.columnTypeInfo);
+        if (columnTypeInfoSet.isEmpty()) {
+          columnTypeInfoSet.putAll(subTableColumnTypeInfo);
+        } else {
+          for (ColumnTypeMetadata_v4.Key key : subTableColumnTypeInfo.keySet()) {
+            ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(key);
+            if (columnTypeMetadata_v4 == null) {
+              columnTypeMetadata_v4 = subTableColumnTypeInfo.get(key);
+            } else {
+              if (subTableColumnTypeInfo.get(key).totalNullCount < 0 || columnTypeMetadata_v4.totalNullCount
< 0) {
+                columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
+              } else {
+                columnTypeMetadata_v4.totalNullCount = columnTypeMetadata_v4.totalNullCount
+ subTableColumnTypeInfo.get(key).totalNullCount;
+              }
+            }
+            columnTypeInfoSet.put(key, columnTypeMetadata_v4);
+          }
+        }
+        dirTotalRowCount = dirTotalRowCount + subTableMetadata.getTotalRowCount();
       } else {
         childFiles.put(file, fs);
       }
     }
-    ParquetTableMetadata_v3 parquetTableMetadata = new ParquetTableMetadata_v3(SUPPORTED_VERSIONS.last().toString(),
-                                                                                DrillVersionInfo.getVersion());
+    Metadata_V4.MetadataSummary metadataSummary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(),
DrillVersionInfo.getVersion());
+    ParquetTableMetadata_v4 parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
     if (childFiles.size() > 0) {
-      List<ParquetFileMetadata_v3 > childFilesMetadata = getParquetFileMetadata_v3(parquetTableMetadata,
childFiles, allColumns, columnSet);
-      metaDataList.addAll(childFilesMetadata);
-      // Note that we do not need to merge the columnInfo at this point. The columnInfo is
already added
-      // to the parquetTableMetadata.
+      List<ParquetFileAndRowCountMetadata> ChildFileAndRowCountMetadata = getParquetFileMetadata_v4(parquetTableMetadata,
childFiles, allColumnsInteresting, columnSet);
+      for (ParquetFileAndRowCountMetadata parquetFileAndRowCountMetadata : ChildFileAndRowCountMetadata)
{
+        metaDataList.add(parquetFileAndRowCountMetadata.getFileMetadata());
+        dirTotalRowCount = dirTotalRowCount + parquetFileAndRowCountMetadata.getFileRowCount();
+        Map<ColumnTypeMetadata_v4.Key, Long> totalNullCountMap = parquetFileAndRowCountMetadata.getTotalNullCountMap();
+        if (columnTypeInfoSet.isEmpty()) {
+          columnTypeInfoSet.putAll(parquetTableMetadata.getColumnTypeInfoMap());
 
 Review comment:
   Shouldn't this set be populated before entering the loop? There does not seem to be a
dependency on anything computed in the loop.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Mime
View raw message