drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [drill] paul-rogers commented on a change in pull request #2026: DRILL-7330: Implement metadata usage for all format plugins
Date Sat, 14 Mar 2020 19:37:13 GMT
paul-rogers commented on a change in pull request #2026: DRILL-7330: Implement metadata usage
for all format plugins
URL: https://github.com/apache/drill/pull/2026#discussion_r392607366
 
 

 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/metastore/store/MetastoreFileTableMetadataProvider.java
 ##########
 @@ -291,124 +250,74 @@ public boolean checkMetadataVersion() {
     }
   }
 
-  private ParquetTableMetadataProvider getFallbackTableMetadataProvider() throws IOException
{
-    if (fallback == null) {
-      fallback = fallbackBuilder == null ? null : fallbackBuilder.build();
-    }
-    return fallback;
-  }
+  public static class Builder<T extends Builder<T>> implements FileTableMetadataProviderBuilder<T>
{
+    protected final MetastoreMetadataProviderManager metadataProviderManager;
 
-  private void throwIfChanged() {
-    if (basicTablesRequests.hasMetastoreTableInfoChanged(metastoreTableInfo)) {
-      throw MetadataException.of(MetadataException.MetadataExceptionType.INCONSISTENT_METADATA);
-    }
-  }
+    // builder for fallback ParquetFileTableMetadataProvider
+    // for the case when required metadata is absent in Metastore
+    protected final TableMetadataProviderBuilder fallback;
 
-  public static class Builder implements ParquetFileTableMetadataProviderBuilder {
-    private final MetastoreMetadataProviderManager metadataProviderManager;
+    protected TupleMetadata schema;
 
-    private List<ReadEntryWithPath> entries;
-    private DrillFileSystem fs;
-    private TupleMetadata schema;
+    protected List<String> paths;
 
     private FileSelection selection;
 
-    // builder for fallback ParquetFileTableMetadataProvider
-    // for the case when required metadata is absent in Metastore
-    private final ParquetFileTableMetadataProviderBuilder fallback;
+    private DrillFileSystem fs;
 
     public Builder(MetastoreMetadataProviderManager source) {
-      this.metadataProviderManager = source;
-      this.fallback = new ParquetTableMetadataProviderImpl.Builder(FileSystemMetadataProviderManager.init());
+      this(source, new SimpleFileTableMetadataProvider.Builder(FileSystemMetadataProviderManager.init()));
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withEntries(List<ReadEntryWithPath>
entries) {
-      this.entries = entries;
-      fallback.withEntries(entries);
-      return this;
+    protected Builder(MetastoreMetadataProviderManager source, TableMetadataProviderBuilder
fallback) {
+      this.metadataProviderManager = source;
+      this.fallback = fallback;
     }
 
     @Override
-    public ParquetFileTableMetadataProviderBuilder withSelectionRoot(Path selectionRoot)
{
-      fallback.withSelectionRoot(selectionRoot);
-      return this;
+    public T withSchema(TupleMetadata schema) {
+      this.schema = schema;
+      return self();
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withCacheFileRoot(Path cacheFileRoot)
{
-      fallback.withCacheFileRoot(cacheFileRoot);
-      return this;
+    public T withSelection(FileSelection selection) {
+      this.selection = selection;
+      return self();
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withReaderConfig(ParquetReaderConfig readerConfig)
{
-      fallback.withReaderConfig(readerConfig);
-      return this;
+    public T withFileSystem(DrillFileSystem fs) {
+      this.fs = fs;
+      return self();
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withFileSystem(DrillFileSystem fs) {
-      fallback.withFileSystem(fs);
-      this.fs = fs;
-      return this;
+    protected T self() {
+      return (T) this;
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withCorrectCorruptedDates(boolean autoCorrectCorruptedDates)
{
-      fallback.withCorrectCorruptedDates(autoCorrectCorruptedDates);
-      return this;
+    public MetastoreMetadataProviderManager metadataProviderManager() {
+      return metadataProviderManager;
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withSelection(FileSelection selection)
{
-      fallback.withSelection(selection);
-      this.selection = selection;
-      return this;
+    public FileSelection selection() {
+      return selection;
     }
 
-    @Override
-    public ParquetFileTableMetadataProviderBuilder withSchema(TupleMetadata schema) {
-      fallback.withSchema(schema);
-      this.schema = schema;
-      return this;
+    public DrillFileSystem fs() {
+      return fs;
     }
 
     @Override
-    public ParquetTableMetadataProvider build() throws IOException {
-      MetastoreParquetTableMetadataProvider provider;
-      SchemaProvider schemaProvider = metadataProviderManager.getSchemaProvider();
-      ParquetMetadataProvider source = (ParquetTableMetadataProvider) metadataProviderManager.getTableMetadataProvider();
-
-      DrillStatsTable statsProvider = metadataProviderManager.getStatsProvider();
-      // schema passed into the builder has greater priority
-      try {
-        if (this.schema == null) {
-          schema = schemaProvider != null ? schemaProvider.read().getSchema() : null;
-        }
-      } catch (IOException e) {
-        logger.debug("Unable to deserialize schema from schema file for table: {}", metadataProviderManager.getTableInfo().name(),
e);
-      }
-      if (entries == null) {
-        if (!selection.isExpandedFully()) {
-          entries = DrillFileSystemUtil.listFiles(fs, selection.getSelectionRoot(), true).stream()
-              .map(fileStatus -> new ReadEntryWithPath(Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())))
-              .collect(Collectors.toList());
-        } else {
-          entries = selection.getFiles().stream()
-              .map(Path::getPathWithoutSchemeAndAuthority)
-              .map(ReadEntryWithPath::new)
-              .collect(Collectors.toList());
-        }
-      }
-      provider = new MetastoreParquetTableMetadataProvider(entries, metadataProviderManager.getMetastoreRegistry(),
-          metadataProviderManager.getTableInfo(), schema, fallback, metadataProviderManager.getConfig(),
statsProvider);
-      // store results into metadataProviderManager to be able to use them when creating
new instances
-      // for the case when source wasn't provided or it contains less row group metadata
than the provider
-      if (source == null || source.getRowGroupsMeta().size() < provider.getRowGroupsMeta().size())
{
-        metadataProviderManager.setTableMetadataProvider(provider);
+    public TableMetadataProvider build() throws IOException {
+      if (!selection().isExpandedFully()) {
+        paths = DrillFileSystemUtil.listFiles(fs, selection.getSelectionRoot(), true).stream()
+            .map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toUri().getPath())
+            .collect(Collectors.toList());
+      } else {
+        paths = selection.getFiles().stream()
+            .map(path -> Path.getPathWithoutSchemeAndAuthority(path).toUri().getPath())
+            .collect(Collectors.toList());
 
 Review comment:
   Showing my ignorance here, but is this information cached between queries? Reading this
info per file for very large data sets will be expensive (which is why Impala caches the metadata,
which leads to the version issues discussed above).

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Mime
View raw message