orc-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From omal...@apache.org
Subject orc git commit: ORC-219: Boolean and timestamp converter for CSV.
Date Mon, 07 Aug 2017 16:28:30 GMT
Repository: orc
Updated Branches:
  refs/heads/master 74200d8c2 -> 268fccb39


ORC-219: Boolean and timestamp converter for CSV.

Added support for boolean and timestamp readers for CSV conversion.
Tested CSV conversion with Postgres TSV output that has boolean and timestamp columns.

Fixes #144

Signed-off-by: Owen O'Malley <omalley@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/268fccb3
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/268fccb3
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/268fccb3

Branch: refs/heads/master
Commit: 268fccb397bc2f2756f5174d135394c51897f6e9
Parents: 74200d8
Author: Seshu Pasam <spasam@uptycs.com>
Authored: Fri Aug 4 19:40:38 2017 -0400
Committer: Owen O'Malley <omalley@apache.org>
Committed: Mon Aug 7 09:27:56 2017 -0700

----------------------------------------------------------------------
 .../org/apache/orc/tools/convert/CsvReader.java | 63 +++++++++++++++++++
 .../apache/orc/tools/convert/TestCsvReader.java | 65 +++++++-------------
 2 files changed, 84 insertions(+), 44 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/268fccb3/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
index 3aa6f1a..1afd4bc 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
@@ -7,18 +7,27 @@ import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
+import org.threeten.bp.LocalDateTime;
+import org.threeten.bp.ZoneId;
+import org.threeten.bp.ZonedDateTime;
+import org.threeten.bp.format.DateTimeFormatter;
+import org.threeten.bp.temporal.TemporalAccessor;
 
 import com.opencsv.CSVReader;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
 
 public class CsvReader implements RecordReader {
+  private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern(
+      "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]");
 
   private long rowNumber = 0;
   private final Converter converter;
@@ -123,6 +132,28 @@ public class CsvReader implements RecordReader {
     }
   }
 
+  class BooleanConverter extends ConverterImpl {
+    BooleanConverter(IntWritable offset) {
+      super(offset);
+    }
+
+    @Override
+    public void convert(String[] values, ColumnVector column, int row) {
+      if (values[offset] == null || nullString.equals(values[offset])) {
+        column.noNulls = false;
+        column.isNull[row] = true;
+      } else {
+        if (values[offset].equalsIgnoreCase("true")
+            || values[offset].equalsIgnoreCase("t")
+            || values[offset].equals("1")) {
+          ((LongColumnVector) column).vector[row] = 1;
+        } else {
+          ((LongColumnVector) column).vector[row] = 0;
+        }
+      }
+    }
+  }
+
   class LongConverter extends ConverterImpl {
     LongConverter(IntWritable offset) {
       super(offset);
@@ -191,6 +222,35 @@ public class CsvReader implements RecordReader {
     }
   }
 
+  class TimestampConverter extends ConverterImpl {
+    TimestampConverter(IntWritable offset) {
+      super(offset);
+    }
+
+    @Override
+    public void convert(String[] values, ColumnVector column, int row) {
+      if (values[offset] == null || nullString.equals(values[offset])) {
+        column.noNulls = false;
+        column.isNull[row] = true;
+      } else {
+        TimestampColumnVector vector = (TimestampColumnVector) column;
+        TemporalAccessor temporalAccessor =
+            DATE_TIME_FORMATTER.parseBest(values[offset],
+                ZonedDateTime.FROM, LocalDateTime.FROM);
+        if (temporalAccessor instanceof ZonedDateTime) {
+          vector.set(row, new Timestamp(
+              ((ZonedDateTime) temporalAccessor).toEpochSecond() * 1000L));
+        } else if (temporalAccessor instanceof LocalDateTime) {
+          vector.set(row, new Timestamp(((LocalDateTime) temporalAccessor)
+              .atZone(ZoneId.systemDefault()).toEpochSecond() * 1000L));
+        } else {
+          column.noNulls = false;
+          column.isNull[row] = true;
+        }
+      }
+    }
+  }
+
   class StructConverter implements Converter {
     final Converter[] children;
 
@@ -222,6 +282,7 @@ public class CsvReader implements RecordReader {
   Converter buildConverter(IntWritable startOffset, TypeDescription schema) {
     switch (schema.getCategory()) {
       case BOOLEAN:
+        return new BooleanConverter(startOffset);
       case BYTE:
       case SHORT:
       case INT:
@@ -237,6 +298,8 @@ public class CsvReader implements RecordReader {
       case CHAR:
       case VARCHAR:
         return new BytesConverter(startOffset);
+      case TIMESTAMP:
+        return new TimestampConverter(startOffset);
       case STRUCT:
         return new StructConverter(startOffset, schema);
       default:

http://git-wip-us.apache.org/repos/asf/orc/blob/268fccb3/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
index c70dcd7..d385725 100644
--- a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
+++ b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
@@ -18,54 +18,23 @@
 
 package org.apache.orc.tools.convert;
 
+import static org.junit.Assert.assertEquals;
+
+import java.io.StringReader;
+
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.orc.ColumnStatistics;
-import org.apache.orc.CompressionKind;
-import org.apache.orc.OrcConf;
-import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.RecordReader;
-import org.apache.orc.StripeStatistics;
 import org.apache.orc.TypeDescription;
-import org.apache.orc.Writer;
-import org.apache.orc.tools.FileDump;
-import org.apache.orc.tools.TestJsonFileDump;
-import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.BufferedReader;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.PrintStream;
-import java.io.StringReader;
-import java.sql.Date;
-import java.sql.Timestamp;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assume.assumeTrue;
-
 public class TestCsvReader {
 
   Configuration conf;
@@ -77,28 +46,33 @@ public class TestCsvReader {
 
   @Test
   public void testSimple() throws Exception {
+    // yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]
     StringReader input = new StringReader(
-        "1,1.25,1.01,'a'\n" +
-        "2,2.5,2.02,'14'\n" +
-        "3,3.75,3.03,'1e'\n" +
-        "4,5,4.04,'28'\n" +
-        "5,6.25,5.05,'32'\n" +
-        "6,7.5,6.06,'3c'\n" +
-        "7,8.75,7.07,'46'\n" +
-        "8,10,8.08,'50'\n"
+        "1,1.25,1.01,'a',f,'2000-01-01T00:00:00+00:00'\n" +
+        "2,2.5,2.02,'14',t,'2000/01/01T00:00:00+00'\n" +
+        "3,3.75,3.03,'1e',false,'2000-01-01T00:00:00Z'\n" +
+        "4,5,4.04,'28',true,'2000-01-01 00:00:00+00'\n" +
+        "5,6.25,5.05,'32',0,'2000-01-01 00:00:00-00'\n" +
+        "6,7.5,6.06,'3c',1,'2000-01-01T04:00:00+04'\n" +
+        "7,8.75,7.07,'46',2,'1999-12-31T20:00:00-04:00'\n" +
+        "8,10,8.08,'50',t,'2000-01-01T00:00:00+00'\n"
     );
     TypeDescription schema = TypeDescription.fromString(
-        "struct<a:int,b:double,c:decimal(10,2),d:string>");
+        "struct<a:int,b:double,c:decimal(10,2),d:string,e:boolean,f:timestamp>");
     RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
         '\\', 0, "");
     VectorizedRowBatch batch = schema.createRowBatch(5);
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(5, batch.size);
+    long bool = 0;
     for(int r = 0; r < batch.size; ++r) {
       assertEquals(r+1, ((LongColumnVector) batch.cols[0]).vector[r]);
       assertEquals(1.25 * (r + 1), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
       assertEquals((r + 1) + ".0" + (r + 1), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
       assertEquals(Integer.toHexString((r + 1) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
+      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
+      bool = 1 - bool;
+      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
     }
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(3, batch.size);
@@ -107,6 +81,9 @@ public class TestCsvReader {
       assertEquals(1.25 * (r + 6), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
       assertEquals((r + 6) + ".0" + (r + 6), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
       assertEquals(Integer.toHexString((r + 6) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
+      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
+      bool = 1 - bool;
+      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
     }
     assertEquals(false, reader.nextBatch(batch));
   }


Mime
View raw message