drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [drill] amansinha100 commented on a change in pull request #1723: DRILL-7063: Separate metadata cache file into summary, file metadata
Date Sun, 07 Apr 2019 20:04:19 GMT
amansinha100 commented on a change in pull request #1723: DRILL-7063: Separate metadata cache
file into summary, file metadata
URL: https://github.com/apache/drill/pull/1723#discussion_r272849629
 
 

 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java
 ##########
 @@ -633,43 +716,169 @@ private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext
metaCont
         parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
         if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(),
path, metadataParentDir, metaContext, fs)) {
           parquetTableMetadataDirs =
-              (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
fs, true, null)).getRight();
+              (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
fs, true, null, true)).getRight();
           newMetadata = true;
         }
       } else {
-        parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
+        if (isFileMetadata) {
+          parquetTableMetadata.assignFiles((mapper.readValue(is, FileMetadata.class)).getFiles());
+          if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new
MetadataVersion(4, 0)) >= 0) {
+            ((ParquetTableMetadata_v4) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
+          }
+
+          if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(),
path, metadataParentDir, metaContext, fs)) {
+            parquetTableMetadata =
+                    (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
fs, true, null, true)).getLeft();
+            newMetadata = true;
+          }
+        } else if (isSummaryFile) {
+          MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
+          ParquetTableMetadata_v4 parquetTableMetadata_v4 = new ParquetTableMetadata_v4(metadataSummary);
+          parquetTableMetadata = (ParquetTableMetadataBase) parquetTableMetadata_v4;
+        } else {
+          parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
+          if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new
MetadataVersion(3, 0)) >= 0) {
+            ((Metadata_V3.ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
+          }
+          if (!alreadyCheckedModification && tableModified((parquetTableMetadata.getDirectories()),
path, metadataParentDir, metaContext, fs)) {
+            parquetTableMetadata =
+                    (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
fs, true, null, true)).getLeft();
+            newMetadata = true;
+          }
+        }
         if (timer != null) {
           logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
           timer.stop();
         }
-        if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new
MetadataVersion(3, 0)) >= 0) {
-          ((ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
-        }
-          if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(),
path, metadataParentDir, metaContext, fs)) {
-          // TODO change with current columns in existing metadata (auto refresh feature)
-          parquetTableMetadata =
-              (createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(path.getParent()),
fs, true, null)).getLeft();
-          newMetadata = true;
+        if (!isSummaryFile) {
+          // DRILL-5009: Remove the RowGroup if it is empty
+          List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
+          if (files != null) {
+            for (ParquetFileMetadata file : files) {
+              List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
+              rowGroups.removeIf(r -> r.getRowCount() == 0);
+            }
+          }
         }
-
-        // DRILL-5009: Remove the RowGroup if it is empty
-        List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
-        for (ParquetFileMetadata file : files) {
-          List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
-          rowGroups.removeIf(r -> r.getRowCount() == 0);
+        if (newMetadata) {
+          // if new metadata files were created, invalidate the existing metadata context
+          metaContext.clear();
         }
-
-      }
-      if (newMetadata) {
-        // if new metadata files were created, invalidate the existing metadata context
-        metaContext.clear();
       }
     } catch (IOException e) {
       logger.error("Failed to read '{}' metadata file", path, e);
       metaContext.setMetadataCacheCorrupted(true);
     }
   }
 
+  private Set<String> getInterestingColumns(FileSystem fs, Path metadataParentDir,
boolean autoRefreshTriggered) {
+    Metadata_V4.MetadataSummary metadataSummary = getSummary(fs, metadataParentDir, autoRefreshTriggered,
null);
+    if (metadataSummary == null) {
+      return null;
+    } else {
+      Set<String> interestingColumns = new HashSet<String>();
+      for (ColumnTypeMetadata_v4 columnTypeMetadata_v4: metadataSummary.columnTypeInfo.values())
{
+        if (columnTypeMetadata_v4.isInteresting) {
+          interestingColumns.add(String.join("", columnTypeMetadata_v4.name));
+        }
+      }
+      return interestingColumns;
+    }
+  }
+
+  private boolean getallColumnsInteresting(FileSystem fs, Path metadataParentDir, boolean
autoRefreshTriggered) {
+    Metadata_V4.MetadataSummary metadataSummary = getSummary(fs, metadataParentDir, autoRefreshTriggered,
null);
+    if (metadataSummary == null) {
+      return true;
+    }
+    return metadataSummary.isAllColumnsInteresting();
+  }
+
+  private static Path getSummaryFileName(Path metadataParentDir) {
+    Path summaryFile = new Path(metadataParentDir, METADATA_SUMMARY_FILENAME);
+    return summaryFile;
+  }
+
+  private static Path getDirFileName(Path metadataParentDir) {
+    Path metadataDirFile = new Path(metadataParentDir, METADATA_DIRECTORIES_FILENAME);
+    return metadataDirFile;
+  }
+
+  private static Path getFileMetadataFileName(Path metadataParentDir) {
+    Path fileMetadataFile = new Path(metadataParentDir, METADATA_FILENAME);
+    return fileMetadataFile;
+  }
+
+  /**
+   * Returns if metadata exists or not in that directory
+   * @param fs filesystem
+   * @param metadataParentDir parent directory that holds metadata files.
+   * @return true if metadata exists in that directory
+   * @throws IOException
+   */
+  private static boolean metadataExists(FileSystem fs, Path metadataParentDir) throws IOException
{
+    Path summaryFile = new Path(metadataParentDir, METADATA_SUMMARY_FILENAME);
+    Path metadataDirFile = new Path(metadataParentDir, METADATA_DIRECTORIES_FILENAME);
+    Path fileMetadataFile = new Path(metadataParentDir, METADATA_FILENAME);
+    if (!fs.exists(summaryFile) && !fs.exists(metadataDirFile) && !fs.exists(fileMetadataFile))
{
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Reads the summary from the metadata cache file, if the cache file is stale recreates
the metadata
+   * @param fs
+   * @param metadataParentDir
+   * @param autoRefreshTriggered true if the auto-refresh is already triggered
+   * @param readerConfig
+   * @return returns metadata summary
+   */
+  public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path metadataParentDir,
boolean autoRefreshTriggered, ParquetReaderConfig readerConfig) {
+    Path summaryFile = getSummaryFileName(metadataParentDir);
+    Path metadataDirFile = getDirFileName(metadataParentDir);
+    MetadataContext metaContext = new MetadataContext();
+    try {
+      // If autoRefresh is not triggered and none of the metadata files exist
+      if (!autoRefreshTriggered && metadataExists(fs, metadataParentDir)) {
 
 Review comment:
   This condition check does not look right. Shouldn't it check metadataDirFile? And should
it be based on autoRefreshTriggered being true?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Mime
View raw message