crunch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mkw...@apache.org
Subject [crunch] branch master updated: CRUNCH-683 avoid unnecessary listStatus() calls from getPathSize() (#26)
Date Fri, 12 Jul 2019 21:30:28 GMT
This is an automated email from the ASF dual-hosted git repository.

mkwhit pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/crunch.git


The following commit(s) were added to refs/heads/master by this push:
     new 8711b2f  CRUNCH-683 avoid unnecessary listStatus() calls from getPathSize() (#26)
8711b2f is described below

commit 8711b2fec4bb3a2b56e39ebaccc316dfa0a0d4eb
Author: Ben Roling <ben.roling@gmail.com>
AuthorDate: Fri Jul 12 16:30:24 2019 -0500

    CRUNCH-683 avoid unnecessary listStatus() calls from getPathSize() (#26)
---
 .../org/apache/crunch/io/SourceTargetHelper.java   |  6 ++-
 .../apache/crunch/io/SourceTargetHelperTest.java   | 50 +++++++++++++++++++---
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java b/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
index 8fb7065..94b6b87 100644
--- a/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
+++ b/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
@@ -22,7 +22,9 @@ import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 
 /**
  * Functions for configuring the inputs/outputs of MapReduce jobs.
@@ -42,8 +44,8 @@ public class SourceTargetHelper {
     long size = 0;
     for (FileStatus status : stati) {
       if (status.isDir()) {
-        for (FileStatus st : fs.listStatus(status.getPath())) {
-          size += getPathSize(fs, st.getPath());
+        for (RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(status.getPath(), true); iterator.hasNext();) {
+          size += iterator.next().getLen();
         }
       } else {
         size += status.getLen();
diff --git a/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java b/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
index 434fd10..8c48af6 100644
--- a/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
+++ b/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
@@ -19,25 +19,30 @@ package org.apache.crunch.io;
 
 import static org.junit.Assert.assertEquals;
 
-import java.io.File;
 import java.io.IOException;
 
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RawLocalFileSystem;
+
+import org.junit.Rule;
 import org.junit.Test;
 
 public class SourceTargetHelperTest {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
 
   @Test
   public void testGetNonexistentPathSize() throws Exception {
-    File tmp = File.createTempFile("pathsize", "");
-    Path tmpPath = new Path(tmp.getAbsolutePath());
-    tmp.delete();
-    FileSystem fs = FileSystem.getLocal(new Configuration(false));
+    Path tmpPath = tmpDir.getRootPath();
+    tmpDir.delete();
+    FileSystem fs = FileSystem.getLocal(tmpDir.getDefaultConfiguration());
     assertEquals(-1L, SourceTargetHelper.getPathSize(fs, tmpPath));
   }
 
@@ -48,6 +53,41 @@ public class SourceTargetHelperTest {
   }
 
   /**
+   * Tests for proper recursive size calculation on a path containing a glob pattern.
+   */
+  @Test
+  public void testGetPathSizeGlobPathRecursive() throws Exception {
+    FileSystem fs = FileSystem.getLocal(tmpDir.getDefaultConfiguration());
+
+    // Create a directory structure with 3 files spread across 2 top-level directories and one subdirectory:
+    // foo1/file1
+    // foo1/subdir/file2
+    // foo2/file3
+    Path foo1 = tmpDir.getPath("foo1");
+    fs.mkdirs(foo1);
+    createFile(fs, new Path(foo1, "file1"), 3);
+
+    Path subDir = tmpDir.getPath("foo1/subdir");
+    fs.mkdirs(subDir);
+    createFile(fs, new Path(subDir, "file2"), 5);
+
+    Path foo2 = tmpDir.getPath("foo2");
+    fs.mkdirs(foo2);
+    createFile(fs, new Path(foo2, "file3"), 11);
+
+    // assert total size with glob pattern (3 + 5 + 11 = 19)
+    assertEquals(19, SourceTargetHelper.getPathSize(fs, tmpDir.getPath("foo*")));
+  }
+
+  private static void createFile(FileSystem fs, Path path, int size) throws IOException {
+    FSDataOutputStream outputStream = fs.create(path);
+    for (int i = 0; i < size; i++) {
+      outputStream.write(0);
+    }
+    outputStream.close();
+  }
+
+  /**
    * Mock FileSystem that returns null for {@link FileSystem#listStatus(Path)}.
    */
   private static class MockFileSystem extends LocalFileSystem {


Mime
View raw message