drill-dev mailing list archives

From GitBox <...@apache.org>
Subject [GitHub] [drill] paul-rogers commented on a change in pull request #2026: DRILL-7330: Implement metadata usage for all format plugins
Date Sat, 14 Mar 2020 19:37:13 GMT
paul-rogers commented on a change in pull request #2026: DRILL-7330: Implement metadata usage for all format plugins
URL: https://github.com/apache/drill/pull/2026#discussion_r392606953
 
 

 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/metastore/store/MetastoreFileTableMetadataProvider.java
 ##########
 @@ -15,149 +15,108 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.drill.exec.metastore;
+package org.apache.drill.exec.metastore.store;
 
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.exec.exception.MetadataException;
-import org.apache.drill.exec.metastore.MetastoreMetadataProviderManager.MetastoreMetadataProviderConfig;
+import org.apache.drill.exec.metastore.MetastoreMetadataProviderManager;
 import org.apache.drill.exec.planner.common.DrillStatsTable;
 import org.apache.drill.exec.record.SchemaUtil;
 import org.apache.drill.exec.record.metadata.TupleMetadata;
 import org.apache.drill.exec.record.metadata.schema.SchemaProvider;
 import org.apache.drill.exec.store.dfs.DrillFileSystem;
 import org.apache.drill.exec.store.dfs.FileSelection;
-import org.apache.drill.exec.store.dfs.ReadEntryWithPath;
-import org.apache.drill.exec.store.parquet.ParquetFileTableMetadataProviderBuilder;
-import org.apache.drill.exec.store.parquet.ParquetReaderConfig;
-import org.apache.drill.exec.store.parquet.ParquetTableMetadataProviderImpl;
 import org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils;
 import org.apache.drill.exec.util.DrillFileSystemUtil;
-import org.apache.drill.metastore.MetastoreRegistry;
 import org.apache.drill.metastore.components.tables.BasicTablesRequests;
 import org.apache.drill.metastore.components.tables.MetastoreTableInfo;
 import org.apache.drill.metastore.metadata.BaseTableMetadata;
 import org.apache.drill.metastore.metadata.FileMetadata;
 import org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata;
 import org.apache.drill.metastore.metadata.PartitionMetadata;
-import org.apache.drill.metastore.metadata.RowGroupMetadata;
 import org.apache.drill.metastore.metadata.SegmentMetadata;
 import org.apache.drill.metastore.metadata.TableInfo;
 import org.apache.drill.metastore.metadata.TableMetadata;
+import org.apache.drill.metastore.metadata.TableMetadataProvider;
+import org.apache.drill.metastore.metadata.TableMetadataProviderBuilder;
 import org.apache.drill.metastore.statistics.ColumnStatistics;
 import org.apache.drill.metastore.statistics.ColumnStatisticsKind;
 import org.apache.drill.metastore.statistics.Statistic;
 import org.apache.drill.metastore.statistics.StatisticsHolder;
 import org.apache.drill.metastore.util.SchemaPathUtils;
-import org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap;
-import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;
 import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 
-public class MetastoreParquetTableMetadataProvider implements ParquetTableMetadataProvider {
-  private static final Logger logger = LoggerFactory.getLogger(MetastoreParquetTableMetadataProvider.class);
-
-  private final BasicTablesRequests basicTablesRequests;
-  private final TableInfo tableInfo;
-  private final MetastoreTableInfo metastoreTableInfo;
-  private final TupleMetadata schema;
-  private final List<ReadEntryWithPath> entries;
-  private final List<String> paths;
-  private final DrillStatsTable statsProvider;
-
-  private final boolean useSchema;
-  private final boolean useStatistics;
-  private final boolean fallbackToFileMetadata;
-
-  private BaseTableMetadata tableMetadata;
-  private Map<Path, SegmentMetadata> segmentsMetadata;
-  private List<PartitionMetadata> partitions;
-  private Map<Path, FileMetadata> files;
-  private Multimap<Path, RowGroupMetadata> rowGroups;
-  private NonInterestingColumnsMetadata nonInterestingColumnsMetadata;
-  // stores builder to provide lazy init for fallback ParquetTableMetadataProvider
-  private final ParquetFileTableMetadataProviderBuilder fallbackBuilder;
-  private ParquetTableMetadataProvider fallback;
-
-  private MetastoreParquetTableMetadataProvider(List<ReadEntryWithPath> entries,
-      MetastoreRegistry metastoreRegistry, TableInfo tableInfo, TupleMetadata schema,
-      ParquetFileTableMetadataProviderBuilder fallbackBuilder, MetastoreMetadataProviderConfig config, DrillStatsTable statsProvider) {
-    this.basicTablesRequests = metastoreRegistry.get().tables().basicRequests();
-    this.tableInfo = tableInfo;
-    this.metastoreTableInfo = basicTablesRequests.metastoreTableInfo(tableInfo);
-    this.useSchema = config.useSchema();
-    this.useStatistics = config.useStatistics();
-    this.fallbackToFileMetadata = config.fallbackToFileMetadata();
-    this.schema = schema;
-    this.entries = entries == null ? new ArrayList<>() : entries;
-    this.fallbackBuilder = fallbackBuilder;
-    this.statsProvider = statsProvider;
-    this.paths = this.entries.stream()
-        .map(readEntryWithPath -> readEntryWithPath.getPath().toUri().getPath())
-        .collect(Collectors.toList());
-  }
-
-  @Override
-  public boolean isUsedMetadataCache() {
-    return false;
-  }
-
-  @Override
-  public Path getSelectionRoot() {
-    return getTableMetadata().getLocation();
-  }
-
-  @Override
-  public List<ReadEntryWithPath> getEntries() {
-    return entries;
-  }
-
-  @Override
-  public List<RowGroupMetadata> getRowGroupsMeta() {
-    return new ArrayList<>(getRowGroupsMetadataMap().values());
-  }
+/**
+ * Implementation of {@link TableMetadataProvider} which uses Drill Metastore for providing table metadata
+ * for file-based tables.
+ */
+public class MetastoreFileTableMetadataProvider implements TableMetadataProvider {
+  private static final Logger logger = LoggerFactory.getLogger(MetastoreFileTableMetadataProvider.class);
+
+  protected final BasicTablesRequests basicTablesRequests;
+  protected final TableInfo tableInfo;
+  protected final MetastoreTableInfo metastoreTableInfo;
+  protected final TupleMetadata schema;
+  protected final List<String> paths;
+  protected final DrillStatsTable statsProvider;
+  protected final TableMetadataProviderBuilder fallbackBuilder;
+
+  protected final boolean useSchema;
+  protected final boolean useStatistics;
+  protected final boolean fallbackToFileMetadata;
+
+  protected BaseTableMetadata tableMetadata;
+  protected Map<Path, SegmentMetadata> segmentsMetadata;
+  protected List<PartitionMetadata> partitions;
+  protected Map<Path, FileMetadata> files;
 
 Review comment:
   One area where we struggled in Impala is the sheer number of partitions and files. Impala caches these in its "state store", replicated in each "coordinator" (like Drill's Foreman). The idea works well -- until the system scales. At that point, it takes a huge amount of memory to store the information. Impala went to great lengths to compress the information to control memory use.
   
   I don't think Drill has the same scale of users as Impala (sadly), but if it did, how would this structure scale? @dobesv just discussed a case on the user list where he might have millions of files. Will this hash map work then?
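   
   To make the concern concrete, here is a rough back-of-envelope sketch. Every per-entry size in it is an assumption rather than a measurement of Drill's `FileMetadata`, but it shows how quickly one `HashMap` entry per file adds up on the planner's heap:
   
```java
// Back-of-envelope sketch only; the per-entry sizes are assumptions, not
// measurements of Drill's FileMetadata or of HashMap on a given JVM.
public class FileMapFootprintSketch {
  public static void main(String[] args) {
    long fileCount = 1_000_000L;       // the "millions of files" case from the user list
    long pathBytes = 200;              // assumed average path string size
    long entryOverheadBytes = 48;      // assumed HashMap entry + object headers
    long columns = 50;                 // assumed table width
    long statsPerColumnBytes = 64;     // assumed min/max/null-count per column

    long perFileBytes = pathBytes + entryOverheadBytes + columns * statsPerColumnBytes;
    long totalBytes = fileCount * perFileBytes;

    // ~3.4 KB per file under these assumptions, so roughly 3.4 GB of planner heap
    // for a million files -- before segments, partitions, or row groups are counted.
    System.out.printf("~%d bytes/file, ~%.1f GB total for %,d files%n",
        perFileBytes, totalBytes / 1e9, fileCount);
  }
}
```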
   
   Also, the files form a hierarchy of directories (partitions). Is the same file metadata duplicated down the partition tree, or do we have to do a hash lookup from each partition into the files map for each of our million files?
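   
   A hypothetical sketch of the two layouts in question (the class and field names here are made up for illustration, not Drill's): either each segment embeds its files' metadata, duplicating it down the tree, or segments hold only paths and every access goes through the shared files map:
   
```java
// Hypothetical illustration only -- these class and field names are not Drill's.
import java.util.List;
import java.util.Map;

class FileMeta {
  long rowCount;                            // stand-in for per-file statistics
}

// Layout (a): each segment embeds copies of its files' metadata down the tree.
class SegmentWithEmbeddedFiles {
  List<FileMeta> files;
}

// Layout (b): segments keep only paths; every use is a lookup into the shared map.
class SegmentWithFileReferences {
  List<String> filePaths;

  long totalRows(Map<String, FileMeta> allFiles) {
    long rows = 0;
    for (String path : filePaths) {
      FileMeta meta = allFiles.get(path);   // a million files means a million lookups
      if (meta != null) {
        rows += meta.rowCount;
      }
    }
    return rows;
  }
}
```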
   
   Another issue encountered in Impala is the cost of refresh when new files arrive. Reloading all the metadata was slow. Impala had no versioning, so a query could start planning with version 1 of the metadata, which could be replaced during planning. Once the query realized it was now working with version 2 objects, it threw an exception and started planning over from scratch.
   
   How does Drill handle this case?
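   
   For reference, the optimistic pattern described above could look roughly like the following. The names are hypothetical and this is not Drill's or Impala's actual API; it is just the restart-on-stale-version idea:
   
```java
// Hedged sketch of the optimistic pattern described above, with hypothetical names:
// capture a metadata version when planning starts, re-check it before the plan is
// finalized, and replan from scratch on a mismatch.
public class OptimisticPlanningSketch {

  static class StaleMetadataException extends RuntimeException { }

  interface MetadataCache {                  // hypothetical versioned metadata source
    long currentVersion();
    Object snapshot(long version);
  }

  static Object planWithRetry(MetadataCache cache, int maxAttempts) {
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
      long versionAtStart = cache.currentVersion();
      Object metadata = cache.snapshot(versionAtStart);
      try {
        Object plan = buildPlan(metadata);
        if (cache.currentVersion() != versionAtStart) {
          throw new StaleMetadataException(); // metadata was replaced during planning
        }
        return plan;
      } catch (StaleMetadataException e) {
        // fall through and start planning over with the newer metadata
      }
    }
    throw new IllegalStateException("metadata kept changing during planning");
  }

  private static Object buildPlan(Object metadata) {
    return new Object();                     // placeholder for the real planner
  }
}
```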

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services
