drill-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cgi...@apache.org
Subject [drill] branch master updated: DRILL-7716: Create Format Plugin for SPSS Files
Date Wed, 06 May 2020 02:03:37 GMT
This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git


The following commit(s) were added to refs/heads/master by this push:
     new 5a067da  DRILL-7716: Create Format Plugin for SPSS Files
5a067da is described below

commit 5a067daf114ebdc61334989bfcdc249fb335f68d
Author: Charles Givre <cgivre@apache.org>
AuthorDate: Tue May 5 20:34:11 2020 -0400

    DRILL-7716: Create Format Plugin for SPSS Files
---
 .../drill/exec/store/excel/TestExcelFormat.java    |  34 +--
 .../drill/exec/store/hdf5/TestHDF5Format.java      |  40 +---
 contrib/format-spss/README.md                      |  93 +++++++++
 contrib/format-spss/pom.xml                        |  88 ++++++++
 .../drill/exec/store/spss/SpssBatchReader.java     | 231 +++++++++++++++++++++
 .../drill/exec/store/spss/SpssFormatConfig.java    |  78 +++++++
 .../drill/exec/store/spss/SpssFormatPlugin.java    |  85 ++++++++
 .../main/resources/bootstrap-format-plugins.json   |  37 ++++
 .../src/main/resources/drill-module.conf           |  23 ++
 .../drill/exec/store/spss/TestSpssReader.java      | 163 +++++++++++++++
 .../src/test/resources/spss/testdata.sav           | Bin 0 -> 14629 bytes
 .../native/client/src/protobuf/UserBitShared.pb.cc |  16 +-
 .../native/client/src/protobuf/UserBitShared.pb.h  |   1 +
 contrib/pom.xml                                    |   1 +
 distribution/pom.xml                               |   5 +
 distribution/src/assemble/component.xml            |   1 +
 .../java/org/apache/drill/test/ClusterTest.java    |   4 +-
 .../java/org/apache/drill/test/QueryTestUtil.java  |  40 ++++
 pom.xml                                            |   4 +-
 .../org/apache/drill/exec/proto/UserBitShared.java |  22 +-
 protocol/src/main/protobuf/UserBitShared.proto     |   1 +
 21 files changed, 879 insertions(+), 88 deletions(-)

diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 5700b40..3eed776 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -20,41 +20,27 @@ package org.apache.drill.exec.store.excel;
 
 import org.apache.drill.common.exceptions.DrillRuntimeException;
 import org.apache.drill.common.types.TypeProtos;
-import org.apache.drill.exec.ExecTest;
 import org.apache.drill.exec.record.metadata.TupleMetadata;
 import org.apache.drill.exec.rpc.RpcException;
 import org.apache.drill.exec.physical.rowSet.RowSet;
 import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
-import org.apache.drill.exec.store.dfs.ZipCodec;
 import org.apache.drill.test.ClusterFixture;
 import org.apache.drill.test.ClusterTest;
 import org.apache.drill.test.QueryBuilder;
 import org.apache.drill.test.rowSet.RowSetComparison;
 import org.apache.drill.exec.record.metadata.SchemaBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
 
-import java.io.FileInputStream;
 import java.nio.file.Paths;
-import org.apache.hadoop.io.compress.CompressionCodec;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
 import org.apache.drill.categories.RowSetTests;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.fail;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
 
 @Category(RowSetTests.class)
 public class TestExcelFormat extends ClusterTest {
@@ -399,22 +385,4 @@ public class TestExcelFormat extends ClusterTest {
 
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
-
-  private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
-    FileSystem fs = ExecTest.getLocalFileSystem();
-    Configuration conf = fs.getConf();
-    conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
-    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
-
-    CompressionCodec codec = factory.getCodecByName(codecName);
-    assertNotNull(codecName + " is not found", codec);
-
-    Path outFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), outFileName);
-    Path inFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), fileName);
-
-    try (InputStream inputStream = new FileInputStream(inFile.toUri().toString());
-         OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
-      IOUtils.copyBytes(inputStream, outputStream, fs.getConf(), false);
-    }
-  }
 }
diff --git a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
index a34488b..03ec0a9 100644
--- a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
+++ b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
@@ -20,10 +20,8 @@ package org.apache.drill.exec.store.hdf5;
 
 import org.apache.drill.categories.RowSetTests;
 import org.apache.drill.common.types.TypeProtos;
-import org.apache.drill.exec.ExecTest;
 import org.apache.drill.exec.record.metadata.TupleMetadata;
 import org.apache.drill.exec.rpc.RpcException;
-import org.apache.drill.exec.store.dfs.ZipCodec;
 import org.apache.drill.test.ClusterFixtureBuilder;
 import org.apache.drill.test.ClusterTest;
 import org.apache.drill.exec.physical.rowSet.RowSet;
@@ -31,28 +29,17 @@ import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
 import org.apache.drill.test.ClusterFixture;
 import org.apache.drill.test.rowSet.RowSetComparison;
 import org.apache.drill.exec.record.metadata.SchemaBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.io.compress.CompressionCodec;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
 
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
 
 @Category(RowSetTests.class)
 public class TestHDF5Format extends ClusterTest {
@@ -908,29 +895,4 @@ public class TestHDF5Format extends ClusterTest {
 
     new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
   }
-
-  /**
-   * Generates a compressed file for testing
-   * @param fileName the input file to be compressed
-   * @param codecName the CODEC to be used for compression
-   * @param outFileName the output file name
-   * @throws IOException Throws IO exception if the file cannot be found or any other IO error
-   */
-  private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
-    FileSystem fs = ExecTest.getLocalFileSystem();
-    Configuration conf = fs.getConf();
-    conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
-    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
-
-    CompressionCodec codec = factory.getCodecByName(codecName);
-    assertNotNull(codecName + " is not found", codec);
-
-    Path outFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), outFileName);
-    Path inFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), fileName);
-
-    try (InputStream inputStream = new FileInputStream(inFile.toUri().toString());
-         OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
-      IOUtils.copyBytes(inputStream, outputStream, fs.getConf(), false);
-    }
-  }
 }
diff --git a/contrib/format-spss/README.md b/contrib/format-spss/README.md
new file mode 100644
index 0000000..75aad3f
--- /dev/null
+++ b/contrib/format-spss/README.md
@@ -0,0 +1,93 @@
+# Format Plugin for SPSS (SAV) Files
+This format plugin enables Apache Drill to read and query Statistical Package for the Social Sciences
+(SPSS) (also known as Statistical Product and Service Solutions) data files. According
+to [Wikipedia](https://en.wikipedia.org/wiki/SPSS):
+ ***
+ SPSS is a widely used program for statistical analysis in social science. It is also used by market 
+ researchers, health researchers, survey companies, government, education researchers, marketing 
+ organizations, data miners, and others. The original SPSS manual (Nie, Bent & Hull, 1970) has been 
+ described as one of "sociology's most influential books" for allowing ordinary researchers to do their 
+ own statistical analysis. In addition to statistical analysis, data management (case selection, file 
+ reshaping, creating derived data) and data documentation (a metadata dictionary is stored in the 
+ datafile) are features of the base software.
+ ***
+ 
+## Configuration 
+To configure Drill to read SPSS files, simply add the following code to the formats section of your 
+file-based storage plugin.  This should happen automatically for the default
+ `cp`, `dfs`, and `S3` storage plugins.
+ 
+Other than the file extensions, there are no variables to configure.
+ 
+```json
+"spss": {         
+  "type": "spss",
+  "extensions": ["sav"]
+ }
+```
+
+## Data Model
+SPSS supports only two data types: numeric and string.  Drill maps these to `DOUBLE` and `VARCHAR`
+respectively. However, some numeric columns carry value labels that map each number to
+text, similar to an `enum` field in Java.
+ 
+For instance, a field called `Survey` might have labels as shown below:
+ 
+ <table>
+    <tr>
+        <th>Value</th>
+        <th>Text</th>
+    </tr>
+    <tr>
+        <td>1</td>
+        <td>Yes</td>
+    </tr>
+    <tr>
+        <td>2</td>
+        <td>No</td>
+    </tr>
+    <tr>
+        <td>99</td>
+        <td>No Answer</td>
+    </tr>
+ </table>
+
+For situations like this, Drill will create two columns. In the example above you would get a column
+called `Survey`, which holds the numeric value (1, 2, or 99), as well as a column called `Survey_value`,
+which maps the integer to the appropriate label. Thus, the results would look something like this:
+ 
+ <table>
+    <tr>
+        <th>`Survey`</th>
+        <th>`Survey_value`</th>
+    </tr>
+    <tr>
+        <td>1</td>
+        <td>Yes</td>
+    </tr>
+    <tr>
+        <td>1</td>
+        <td>Yes</td>
+    </tr>
+    <tr>
+        <td>1</td>
+        <td>Yes</td>
+    </tr>
+    <tr>
+        <td>2</td>
+        <td>No</td>
+    </tr>
+    <tr>
+        <td>1</td>
+        <td>Yes</td>
+    </tr>
+    <tr>
+        <td>2</td>
+        <td>No</td>
+    </tr>
+    <tr>
+        <td>99</td>
+        <td>No Answer</td>
+    </tr>
+ </table>
+ 
\ No newline at end of file
diff --git a/contrib/format-spss/pom.xml b/contrib/format-spss/pom.xml
new file mode 100644
index 0000000..20cbef4
--- /dev/null
+++ b/contrib/format-spss/pom.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0"?>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>drill-contrib-parent</artifactId>
+    <groupId>org.apache.drill.contrib</groupId>
+    <version>1.18.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>drill-format-spss</artifactId>
+  <name>contrib/format-spss</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.drill.exec</groupId>
+      <artifactId>drill-java-exec</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.bedatadriven.spss</groupId>
+      <artifactId>spss-reader</artifactId>
+      <version>1.3</version>
+    </dependency>
+
+    <!-- Test dependencies -->
+    <dependency>
+      <groupId>org.apache.drill.exec</groupId>
+      <artifactId>drill-java-exec</artifactId>
+      <classifier>tests</classifier>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.drill</groupId>
+      <artifactId>drill-common</artifactId>
+      <classifier>tests</classifier>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-resources-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>copy-java-sources</id>
+            <phase>process-sources</phase>
+            <goals>
+              <goal>copy-resources</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>${basedir}/target/classes/org/apache/drill/exec/store/spss
+              </outputDirectory>
+              <resources>
+                <resource>
+                  <directory>src/main/java/org/apache/drill/exec/store/spss</directory>
+                  <filtering>true</filtering>
+                </resource>
+              </resources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java
new file mode 100644
index 0000000..8d6ba99
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssBatchReader.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import com.bedatadriven.spss.SpssDataFileReader;
+import com.bedatadriven.spss.SpssVariable;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.hadoop.mapred.FileSplit;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class SpssBatchReader implements ManagedReader<FileSchemaNegotiator> {
+
+  private static final Logger logger = LoggerFactory.getLogger(SpssBatchReader.class);
+
+  private static final String VALUE_LABEL = "_value";
+
+  private FileSplit split;
+
+  private InputStream fsStream;
+
+  private SpssDataFileReader spssReader;
+
+  private RowSetLoader rowWriter;
+
+  private List<SpssVariable> variableList;
+
+  private List<SpssColumnWriter> writerList;
+
+  private CustomErrorContext errorContext;
+
+
+  public static class SpssReaderConfig {
+
+    protected final SpssFormatPlugin plugin;
+
+    public SpssReaderConfig(SpssFormatPlugin plugin) {
+      this.plugin = plugin;
+    }
+  }
+
+  @Override
+  public boolean open(FileSchemaNegotiator negotiator) {
+    split = negotiator.split();
+    openFile(negotiator);
+    negotiator.tableSchema(buildSchema(), true);
+    errorContext = negotiator.parentErrorContext();
+    ResultSetLoader loader = negotiator.build();
+    rowWriter = loader.writer();
+    buildReaderList();
+
+    return true;
+  }
+
+  @Override
+  public boolean next() {
+    while (!rowWriter.isFull()) {
+      if (!processNextRow()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Override
+  public void close() {
+    if (fsStream != null) {
+      AutoCloseables.closeSilently(fsStream);
+      fsStream = null;
+    }
+  }
+
+  private void openFile(FileSchemaNegotiator negotiator) {
+    try {
+      fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+      spssReader = new SpssDataFileReader(fsStream);
+    } catch (IOException e) {
+      throw UserException
+        .dataReadError(e)
+        .message("Unable to open SPSS File %s", split.getPath())
+        .addContext(e.getMessage())
+        .addContext(errorContext)
+        .build(logger);
+    }
+  }
+
+  private boolean processNextRow() {
+    try {
+      // Stop reading when you run out of data
+      if (!spssReader.readNextCase()) {
+        return false;
+      }
+
+      rowWriter.start();
+      for (SpssColumnWriter spssColumnWriter : writerList) {
+        spssColumnWriter.load(spssReader);
+      }
+      rowWriter.save();
+
+    } catch (IOException e) {
+      throw UserException
+        .dataReadError(e)
+        .message("Error reading SPSS File.")
+        .addContext(errorContext)
+        .build(logger);
+    }
+    return true;
+  }
+
+  private TupleMetadata buildSchema() {
+    SchemaBuilder builder = new SchemaBuilder();
+    variableList = spssReader.getVariables();
+
+    for (SpssVariable variable : variableList) {
+      String varName = variable.getVariableName();
+
+      if (variable.isNumeric()) {
+        builder.addNullable(varName, TypeProtos.MinorType.FLOAT8);
+
+        // Check if the column has lookups associated with it
+        if (variable.getValueLabels() != null && variable.getValueLabels().size() > 0) {
+          builder.addNullable(varName + VALUE_LABEL, TypeProtos.MinorType.VARCHAR);
+        }
+
+      } else {
+        builder.addNullable(varName, TypeProtos.MinorType.VARCHAR);
+      }
+    }
+    return builder.buildSchema();
+  }
+
+  private void buildReaderList() {
+    writerList = new ArrayList<>();
+
+    for (SpssVariable variable : variableList) {
+      if (variable.isNumeric()) {
+        writerList.add(new NumericSpssColumnWriter(variable.getIndex(), variable.getVariableName(), rowWriter, spssReader));
+      } else {
+        writerList.add(new StringSpssColumnWriter(variable.getIndex(), variable.getVariableName(), rowWriter));
+      }
+    }
+  }
+
+  public abstract static class SpssColumnWriter {
+    final String columnName;
+    final ScalarWriter writer;
+    final int columnIndex;
+
+    public SpssColumnWriter(int columnIndex, String columnName, ScalarWriter writer) {
+      this.columnIndex = columnIndex;
+      this.columnName = columnName;
+      this.writer = writer;
+    }
+
+    public abstract void load (SpssDataFileReader reader);
+  }
+
+  public static class StringSpssColumnWriter extends SpssColumnWriter {
+
+    StringSpssColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+    }
+
+    @Override
+    public void load(SpssDataFileReader reader) {
+      writer.setString(reader.getStringValue(columnIndex));
+    }
+  }
+
+  public static class NumericSpssColumnWriter extends SpssColumnWriter {
+
+    ScalarWriter labelWriter;
+    Map<Double, String> labels;
+
+    NumericSpssColumnWriter(int columnIndex, String columnName, RowSetLoader rowWriter, SpssDataFileReader reader) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+      if (reader.getValueLabels(columnName) != null && reader.getValueLabels(columnName).size() != 0) {
+        labelWriter = rowWriter.scalar(columnName + VALUE_LABEL);
+        labels = reader.getValueLabels(columnIndex);
+      }
+    }
+
+    @Override
+    public void load(SpssDataFileReader reader) {
+      double value = reader.getDoubleValue(columnIndex);
+
+      if (labelWriter != null) {
+        String labelValue = labels.get(value);
+        if (labelValue == null) {
+          labelWriter.setNull();
+        } else {
+          labelWriter.setString(labelValue);
+        }
+      }
+      writer.setDouble(value);
+    }
+  }
+}
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java
new file mode 100644
index 0000000..ad35cd8
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatConfig.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+
+import org.apache.drill.common.PlanStringBuilder;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import org.apache.drill.exec.store.spss.SpssBatchReader.SpssReaderConfig;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+@JsonTypeName(SpssFormatPlugin.DEFAULT_NAME)
+@JsonInclude(JsonInclude.Include.NON_DEFAULT)
+public class SpssFormatConfig implements FormatPluginConfig {
+  private final List<String> extensions;
+
+  // Omitted properties take reasonable defaults
+  @JsonCreator
+  public SpssFormatConfig(@JsonProperty("extensions") List<String> extensions) {
+    this.extensions = extensions == null ? Collections.singletonList("sav") : ImmutableList.copyOf(extensions);
+  }
+
+  @JsonInclude(JsonInclude.Include.NON_DEFAULT)
+  public List<String> getExtensions() {
+    return extensions;
+  }
+
+  public SpssReaderConfig getReaderConfig(SpssFormatPlugin plugin) {
+    return new SpssReaderConfig(plugin);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(extensions);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    SpssFormatConfig other = (SpssFormatConfig) obj;
+    return Objects.equals(extensions, other.extensions);
+  }
+
+  @Override
+  public String toString() {
+    return new PlanStringBuilder(this)
+      .field("extensions", extensions)
+      .toString();
+  }
+}
diff --git a/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java
new file mode 100644
index 0000000..5e780b2
--- /dev/null
+++ b/contrib/format-spss/src/main/java/org/apache/drill/exec/store/spss/SpssFormatPlugin.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import org.apache.drill.common.logical.StoragePluginConfig;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.proto.UserBitShared;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.conf.Configuration;
+
+
+public class SpssFormatPlugin extends EasyFormatPlugin<SpssFormatConfig> {
+
+  protected static final String DEFAULT_NAME = "spss";
+
+  private static class SpssReaderFactory extends FileReaderFactory {
+
+    @Override
+    public ManagedReader<? extends FileSchemaNegotiator> newReader() {
+      return new SpssBatchReader();
+    }
+  }
+
+  public SpssFormatPlugin(String name, DrillbitContext context,
+                           Configuration fsConf, StoragePluginConfig storageConfig,
+                           SpssFormatConfig formatConfig) {
+    super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig);
+  }
+
+  private static EasyFormatConfig easyConfig(Configuration fsConf, SpssFormatConfig pluginConfig) {
+    EasyFormatConfig config = new EasyFormatConfig();
+    config.readable = true;
+    config.writable = false;
+    config.blockSplittable = false;
+    config.compressible = true;
+    config.supportsProjectPushdown = true;
+    config.extensions = pluginConfig.getExtensions();
+    config.fsConf = fsConf;
+    config.defaultName = DEFAULT_NAME;
+    config.readerOperatorType = UserBitShared.CoreOperatorType.SPSS_SUB_SCAN_VALUE;
+    config.useEnhancedScan = true;
+    return config;
+  }
+
+  @Override
+  public ManagedReader<? extends FileSchemaNegotiator> newBatchReader(
+    EasySubScan scan, OptionManager options)  {
+    return new SpssBatchReader();
+  }
+
+  @Override
+  protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) {
+    FileScanBuilder builder = new FileScanBuilder();
+    builder.setReaderFactory(new SpssReaderFactory());
+
+    initScanBuilder(builder, scan);
+    builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR));
+    return builder;
+  }
+}
diff --git a/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json b/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json
new file mode 100644
index 0000000..7d4f250
--- /dev/null
+++ b/contrib/format-spss/src/main/resources/bootstrap-format-plugins.json
@@ -0,0 +1,37 @@
+{
+  "storage": {
+    "dfs": {
+      "type": "file",
+      "formats": {
+        "spss": {
+          "type": "spss",
+          "extensions": [
+            "sav"
+          ]
+        }
+      }
+    },
+    "cp": {
+      "type": "file",
+      "formats": {
+        "spss": {
+          "type": "spss",
+          "extensions": [
+            "sav"
+          ]
+        }
+      }
+    },
+    "s3": {
+      "type": "file",
+      "formats": {
+        "spss": {
+          "type": "spss",
+          "extensions": [
+            "sav"
+          ]
+        }
+      }
+    }
+  }
+}
diff --git a/contrib/format-spss/src/main/resources/drill-module.conf b/contrib/format-spss/src/main/resources/drill-module.conf
new file mode 100644
index 0000000..2e35ff7
--- /dev/null
+++ b/contrib/format-spss/src/main/resources/drill-module.conf
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#  This file tells Drill to consider this module when class path scanning.
+#  This file can also include any supplementary configuration information.
+#  This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information.
+
+drill.classpath.scanning.packages += "org.apache.drill.exec.store.spss"
diff --git a/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java b/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java
new file mode 100644
index 0000000..b54c4f8
--- /dev/null
+++ b/contrib/format-spss/src/test/java/org/apache/drill/exec/store/spss/TestSpssReader.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.spss;
+
+import org.apache.drill.categories.RowSetTests;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
+
+@Category(RowSetTests.class)
+public class TestSpssReader extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+    // Needed for compressed file unit test
+    dirTestWatcher.copyResourceToRoot(Paths.get("spss/"));
+  }
+
+  @Test
+  public void testStarQuery() throws Exception {
+    String sql = "SELECT * FROM dfs.`spss/testdata.sav` WHERE d16=4";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("ID", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("District", TypeProtos.MinorType.FLOAT8)
+      .addNullable("District_value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("Province", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Province_value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("Interviewer", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Date", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_1", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_1_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_2", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_2_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_3", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_3_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_4", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_4_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("s_1", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_5", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_5_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_6", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_6_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d6_7", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d6_7_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("q1", TypeProtos.MinorType.FLOAT8)
+      .addNullable("q1_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("q2", TypeProtos.MinorType.FLOAT8)
+      .addNullable("q2_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d7a", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d7a_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d7b", TypeProtos.MinorType.FLOAT8)
+      .addNullable("d7b_Value", TypeProtos.MinorType.VARCHAR)
+      .addNullable("d16", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Stratum", TypeProtos.MinorType.FLOAT8)
+      .addNullable("S1_IP", TypeProtos.MinorType.FLOAT8)
+      .addNullable("S2_IP", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Sample_Weight", TypeProtos.MinorType.FLOAT8)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(47.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 151.0, 1.34557632E10, 1.0, "Yes", 2.0, "No", 2.0, "No", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 2.0, "The same", 5.0, "Housewife (not working outside of the home)", 97.0, "Not Asked", 4.0, 121.0, 0.007463305415042708, 0.006666666666666667, 20098.33333333333)
+      .addRow(53.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 151.0, 1.34557632E10, 1.0, "Yes", 2.0, "No", 2.0, "No", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 2.0, "The same", 5.0, "Housewife (not working outside of the home)", 97.0, "Not Asked", 4.0, 121.0, 0.007463305415042708, 0.006666666666666667, 20098.33333333333)
+      .addRow(66.0, 1.0, "Urban", 101.0, "Kabul", 1.0, "Kabul", 774.0, 1.34556768E10, 2.0, "No", 1.0, "Yes", 1.0, "Yes", 2.0, "No", "", 2.0, "No", 2.0, "No", 2.0, "No", 1.0, "Good", 1.0, "Better", 1.0, "Working full time", 13.0, "Private Business Sole Proprietor", 4.0, 111.0, 0.017389288198469743, 0.006666666666666667, 8626.0)
+      .build();
+
+    assertEquals(3, results.rowCount());
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testExplicitQuery() throws Exception {
+    String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav` WHERE d16=4";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("ID", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR)
+      .buildSchema();
+
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban")
+      .addRow(66.0, 1.0, "Urban")
+      .build();
+
+    assertEquals(3, results.rowCount());
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testSerDe() throws Exception {
+    String sql = "SELECT COUNT(*) FROM dfs.`spss/testdata.sav`";
+    String plan = queryBuilder().sql(sql).explainJson();
+    long cnt = queryBuilder().physical(plan).singletonLong();
+    assertEquals("Counts should match", 25L, cnt);
+  }
+
+  @Test
+  public void testExplicitQueryWithCompressedFile() throws Exception {
+    generateCompressedFile("spss/testdata.sav", "zip", "spss/testdata.sav.zip");
+
+    String sql = "SELECT ID, Urban, Urban_value FROM dfs.`spss/testdata.sav.zip`  WHERE d16=4";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("ID", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban", TypeProtos.MinorType.FLOAT8)
+      .addNullable("Urban_value", TypeProtos.MinorType.VARCHAR)
+      .buildSchema();
+
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(47.0, 1.0, "Urban").addRow(53.0, 1.0, "Urban")
+      .addRow(66.0, 1.0, "Urban")
+      .build();
+
+    assertEquals(3, results.rowCount());
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+}
diff --git a/contrib/format-spss/src/test/resources/spss/testdata.sav b/contrib/format-spss/src/test/resources/spss/testdata.sav
new file mode 100644
index 0000000..0351d9f
Binary files /dev/null and b/contrib/format-spss/src/test/resources/spss/testdata.sav differ
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.cc b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
index ed13f3e..bac074a 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.cc
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.cc
@@ -956,7 +956,7 @@ const char descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
   "ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000"
   "\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014"
   "\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022"
-  "\032\n\026CANCELLATION_REQUESTED\020\006*\367\n\n\020CoreOper"
+  "\032\n\026CANCELLATION_REQUESTED\020\006*\212\013\n\020CoreOper"
   "atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST"
   "_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020"
   "\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH"
@@ -991,11 +991,12 @@ const char descriptor_table_protodef_UserBitShared_2eproto[] PROTOBUF_SECTION_VA
   "MERGE\020=\022\021\n\rLTSV_SUB_SCAN\020>\022\021\n\rHDF5_SUB_S"
   "CAN\020\?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA"
   "N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO"
-  "NTROLLER\020C\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslSta"
-  "tus\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n"
-  "\020SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n"
-  "\013SASL_FAILED\020\004B.\n\033org.apache.drill.exec."
-  "protoB\rUserBitSharedH\001"
+  "NTROLLER\020C\022\021\n\rSPSS_SUB_SCAN\020E\022\021\n\rHTTP_SU"
+  "B_SCAN\020F*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000"
+  "\022\016\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020"
+  "\n\014SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org"
+  ".apache.drill.exec.protoB\rUserBitSharedH"
+  "\001"
   ;
 static const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const descriptor_table_UserBitShared_2eproto_deps[3] = {
   &::descriptor_table_Coordination_2eproto,
@@ -1029,7 +1030,7 @@ static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_Use
 static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_UserBitShared_2eproto_once;
 static bool descriptor_table_UserBitShared_2eproto_initialized = false;
 const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_UserBitShared_2eproto = {
-  &descriptor_table_UserBitShared_2eproto_initialized, descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5782,
+  &descriptor_table_UserBitShared_2eproto_initialized, descriptor_table_protodef_UserBitShared_2eproto, "UserBitShared.proto", 5801,
   &descriptor_table_UserBitShared_2eproto_once, descriptor_table_UserBitShared_2eproto_sccs, descriptor_table_UserBitShared_2eproto_deps, 22, 3,
   schemas, file_default_instances, TableStruct_UserBitShared_2eproto::offsets,
   file_level_metadata_UserBitShared_2eproto, 22, file_level_enum_descriptors_UserBitShared_2eproto, file_level_service_descriptors_UserBitShared_2eproto,
@@ -1265,6 +1266,7 @@ bool CoreOperatorType_IsValid(int value) {
     case 65:
     case 66:
     case 67:
+    case 69:
     case 70:
       return true;
     default:
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.h b/contrib/native/client/src/protobuf/UserBitShared.pb.h
index 0cc48d8..95cdb20 100644
--- a/contrib/native/client/src/protobuf/UserBitShared.pb.h
+++ b/contrib/native/client/src/protobuf/UserBitShared.pb.h
@@ -390,6 +390,7 @@ enum CoreOperatorType : int {
   SHP_SUB_SCAN = 65,
   METADATA_HANDLER = 66,
   METADATA_CONTROLLER = 67,
+  SPSS_SUB_SCAN = 69,
   HTTP_SUB_SCAN = 70
 };
 bool CoreOperatorType_IsValid(int value);
diff --git a/contrib/pom.xml b/contrib/pom.xml
index 15f3dd5..d81f717 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -48,6 +48,7 @@
     <module>format-excel</module>
     <module>format-esri</module>
     <module>format-hdf5</module>
+    <module>format-spss</module>
     <module>storage-hive</module>
     <module>storage-mongo</module>
     <module>storage-jdbc</module>
diff --git a/distribution/pom.xml b/distribution/pom.xml
index ddffb44..8b8369a 100644
--- a/distribution/pom.xml
+++ b/distribution/pom.xml
@@ -349,6 +349,11 @@
         </dependency>
         <dependency>
           <groupId>org.apache.drill.contrib</groupId>
+          <artifactId>drill-format-spss</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.drill.contrib</groupId>
           <artifactId>drill-format-ltsv</artifactId>
           <version>${project.version}</version>
         </dependency>
diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml
index 05fd203..01e371e 100644
--- a/distribution/src/assemble/component.xml
+++ b/distribution/src/assemble/component.xml
@@ -47,6 +47,7 @@
         <include>org.apache.drill.contrib:drill-format-hdf5:jar</include>
         <include>org.apache.drill.contrib:drill-format-ltsv:jar</include>
         <include>org.apache.drill.contrib:drill-format-excel:jar</include>
+        <include>org.apache.drill.contrib:drill-format-spss:jar</include>
         <include>org.apache.drill.contrib:drill-jdbc-storage:jar</include>
         <include>org.apache.drill.contrib:drill-kudu-storage:jar</include>
         <include>org.apache.drill.contrib:drill-storage-kafka:jar</include>
diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
index 7ef6198..43f1396 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/ClusterTest.java
@@ -17,12 +17,12 @@
  */
 package org.apache.drill.test;
 
-import java.io.IOException;
-
 import org.apache.drill.common.AutoCloseables;
 import org.junit.AfterClass;
 import org.junit.ClassRule;
 
+import java.io.IOException;
+
 /**
  * Base class for tests that use a single cluster fixture for a set of
  * tests. Extend your test case directly from {@link DrillTest} if you
diff --git a/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java b/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
index 0d3e436..86ec9ca 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/test/QueryTestUtil.java
@@ -17,7 +17,10 @@
  */
 package org.apache.drill.test;
 
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.net.BindException;
 import java.net.ServerSocket;
 import java.util.List;
@@ -27,6 +30,7 @@ import java.util.regex.Pattern;
 
 import org.apache.drill.common.config.DrillConfig;
 import org.apache.drill.exec.ExecConstants;
+import org.apache.drill.exec.ExecTest;
 import org.apache.drill.exec.client.DrillClient;
 import org.apache.drill.exec.client.LoggingResultsListener;
 import org.apache.drill.exec.client.QuerySubmitter.Format;
@@ -43,10 +47,21 @@ import org.apache.drill.exec.server.RemoteServiceSet;
 import org.apache.drill.exec.server.options.OptionManager;
 import org.apache.drill.exec.server.options.OptionValue;
 import org.apache.drill.exec.server.options.SystemOptionManager;
+import org.apache.drill.exec.store.dfs.ZipCodec;
 import org.apache.drill.exec.util.VectorUtil;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.drill.test.ClusterTest.dirTestWatcher;
+import static org.junit.Assert.assertNotNull;
+
 /**
  * Utilities useful for tests that issue SQL queries.
  */
@@ -312,4 +327,29 @@ public class QueryTestUtil {
     throw new BindException(String.format("Free port could not be found in the range [%s-%s].\n" +
         "Please release any of used ports in this range.", portNumber, portNumber + numberOfAttempts));
   }
+
+  /**
+   * Generates a compressed version of the file for testing
+   * @param fileName Name of the input file
+   * @param codecName The desired CODEC to be used.
+   * @param outFileName Name of generated compressed file
+   * @throws IOException If function cannot generate file, throws IOException
+   */
+  public static void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
+    FileSystem fs = ExecTest.getLocalFileSystem();
+    Configuration conf = fs.getConf();
+    conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, ZipCodec.class.getCanonicalName());
+    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+
+    CompressionCodec codec = factory.getCodecByName(codecName);
+    assertNotNull(codecName + " is not found", codec);
+
+    Path outFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), outFileName);
+    Path inFile = new Path(dirTestWatcher.getRootDir().getAbsolutePath(), fileName);
+
+    try (InputStream inputStream = new FileInputStream(inFile.toUri().toString());
+         OutputStream outputStream = codec.createOutputStream(fs.create(outFile))) {
+      IOUtils.copyBytes(inputStream, outputStream, fs.getConf(), false);
+    }
+  }
 }
diff --git a/pom.xml b/pom.xml
index 9d99fc5..5329d58 100644
--- a/pom.xml
+++ b/pom.xml
@@ -170,7 +170,7 @@
     <repository>
       <id>conjars</id>
       <name>Conjars</name>
-      <url>http://conjars.org/repo</url>
+      <url>https://conjars.org/repo</url>
       <layout>default</layout>
       <releases>
         <enabled>true</enabled>
@@ -359,6 +359,7 @@
             <exclude>**/*.pcap</exclude>
             <exclude>**/*.log1</exclude>
             <exclude>**/*.log2</exclude>
+            <exclude>**/*.sav</exclude>
             <exclude>**/*.h5</exclude>
             <exclude>**/*.sqllog</exclude>
             <exclude>**/*.sqllog2</exclude>
@@ -665,6 +666,7 @@
               <exclude>**/*.woff2</exclude>
               <exclude>**/*.ks</exclude>
               <exclude>**/*.pcap</exclude>
+              <exclude>**/*.sav</exclude>
               <exclude>**/*.log1</exclude>
               <exclude>**/*.log2</exclude>
               <exclude>**/*.h5</exclude>
diff --git a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
index 44906c3..1510a46 100644
--- a/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
+++ b/protocol/src/main/java/org/apache/drill/exec/proto/UserBitShared.java
@@ -686,6 +686,10 @@ public final class UserBitShared {
      */
     METADATA_CONTROLLER(67),
     /**
+     * <code>SPSS_SUB_SCAN = 69;</code>
+     */
+    SPSS_SUB_SCAN(69),
+    /**
      * <code>HTTP_SUB_SCAN = 70;</code>
      */
     HTTP_SUB_SCAN(70),
@@ -964,6 +968,10 @@ public final class UserBitShared {
      */
     public static final int METADATA_CONTROLLER_VALUE = 67;
     /**
+     * <code>SPSS_SUB_SCAN = 69;</code>
+     */
+    public static final int SPSS_SUB_SCAN_VALUE = 69;
+    /**
      * <code>HTTP_SUB_SCAN = 70;</code>
      */
     public static final int HTTP_SUB_SCAN_VALUE = 70;
@@ -1057,6 +1065,7 @@ public final class UserBitShared {
         case 65: return SHP_SUB_SCAN;
         case 66: return METADATA_HANDLER;
         case 67: return METADATA_CONTROLLER;
+        case 69: return SPSS_SUB_SCAN;
         case 70: return HTTP_SUB_SCAN;
         default: return null;
       }
@@ -29037,7 +29046,7 @@ public final class UserBitShared {
       "ATEMENT\020\005*\207\001\n\rFragmentState\022\013\n\007SENDING\020\000" +
       "\022\027\n\023AWAITING_ALLOCATION\020\001\022\013\n\007RUNNING\020\002\022\014" +
       "\n\010FINISHED\020\003\022\r\n\tCANCELLED\020\004\022\n\n\006FAILED\020\005\022" +
-      "\032\n\026CANCELLATION_REQUESTED\020\006*\367\n\n\020CoreOper" +
+      "\032\n\026CANCELLATION_REQUESTED\020\006*\212\013\n\020CoreOper" +
       "atorType\022\021\n\rSINGLE_SENDER\020\000\022\024\n\020BROADCAST" +
       "_SENDER\020\001\022\n\n\006FILTER\020\002\022\022\n\016HASH_AGGREGATE\020" +
       "\003\022\r\n\tHASH_JOIN\020\004\022\016\n\nMERGE_JOIN\020\005\022\031\n\025HASH" +
@@ -29072,11 +29081,12 @@ public final class UserBitShared {
       "MERGE\020=\022\021\n\rLTSV_SUB_SCAN\020>\022\021\n\rHDF5_SUB_S" +
       "CAN\020?\022\022\n\016EXCEL_SUB_SCAN\020@\022\020\n\014SHP_SUB_SCA" +
       "N\020A\022\024\n\020METADATA_HANDLER\020B\022\027\n\023METADATA_CO" +
-      "NTROLLER\020C\022\021\n\rHTTP_SUB_SCAN\020F*g\n\nSaslSta" +
-      "tus\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n" +
-      "\020SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n" +
-      "\013SASL_FAILED\020\004B.\n\033org.apache.drill.exec." +
-      "protoB\rUserBitSharedH\001"
+      "NTROLLER\020C\022\021\n\rSPSS_SUB_SCAN\020E\022\021\n\rHTTP_SU" +
+      "B_SCAN\020F*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000" +
+      "\022\016\n\nSASL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020" +
+      "\n\014SASL_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org" +
+      ".apache.drill.exec.protoB\rUserBitSharedH" +
+      "\001"
     };
     descriptor = com.google.protobuf.Descriptors.FileDescriptor
       .internalBuildGeneratedFileFrom(descriptorData,
diff --git a/protocol/src/main/protobuf/UserBitShared.proto b/protocol/src/main/protobuf/UserBitShared.proto
index c51cc66..3b99255 100644
--- a/protocol/src/main/protobuf/UserBitShared.proto
+++ b/protocol/src/main/protobuf/UserBitShared.proto
@@ -379,6 +379,7 @@ enum CoreOperatorType {
   SHP_SUB_SCAN = 65;
   METADATA_HANDLER = 66;
   METADATA_CONTROLLER = 67;
+  SPSS_SUB_SCAN = 69;
   HTTP_SUB_SCAN = 70;
 }
 


Mime
View raw message