datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject git commit: DATAFU-62 URLInfo UDF and corresponding tests.
Date Mon, 25 Aug 2014 03:17:39 GMT
Repository: incubator-datafu
Updated Branches:
  refs/heads/master 0f9b853be -> b7bef9c88


DATAFU-62 URLInfo UDF and corresponding tests.

https://issues.apache.org/jira/browse/DATAFU-62

Signed-off-by: Matthew Hayes <matthew.terence.hayes@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/b7bef9c8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/b7bef9c8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/b7bef9c8

Branch: refs/heads/master
Commit: b7bef9c888397763b736cc0147f1b8a469a0cd62
Parents: 0f9b853
Author: Joydeep Banerjee <jbanerjee1@gmail.com>
Authored: Sun Aug 10 12:02:59 2014 -0700
Committer: Matthew Hayes <matthew.terence.hayes@gmail.com>
Committed: Sun Aug 24 20:15:47 2014 -0700

----------------------------------------------------------------------
 .../src/main/java/datafu/pig/urls/URLInfo.java  | 148 +++++++++++++++++++
 .../java/datafu/test/pig/urls/URLInfoTest.java  |  72 +++++++++
 2 files changed, 220 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/b7bef9c8/datafu-pig/src/main/java/datafu/pig/urls/URLInfo.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/urls/URLInfo.java b/datafu-pig/src/main/java/datafu/pig/urls/URLInfo.java
new file mode 100644
index 0000000..23ff10a
--- /dev/null
+++ b/datafu-pig/src/main/java/datafu/pig/urls/URLInfo.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.pig.urls;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * Given a valid URL, this UDF provides the following information about that URL:
+ * Domain, Host, Protocol, Path, Port, Query Params and their values
+ */
+public class URLInfo extends EvalFunc<Tuple> {
+
+    private static final int TUPLE_ELEMENTS = 6;
+    private static Pattern pDomain = Pattern.compile(".*?([^.]+\\.[^.]+)");
+
+    private static String getDomain(URL url) {
+        Matcher m = pDomain.matcher(url.getHost());
+        if (m.matches())
+            return m.group(1);
+        else
+            return null;
+    }
+
+    private static String getHost(URL url) {
+        return url.getHost();
+    }
+
+    private static String getProtocol(URL url) {
+        return url.getProtocol();
+    }
+
+    private static String getPath(URL url) {
+        return url.getPath();
+    }
+
+    private static Integer getPort(URL url) {
+        int port = url.getPort();
+        if (port == -1) return null;
+        return port;
+    }
+
+    private static Map<String, String> getQueryParams(URL url) {
+        String queryString = url.getQuery();
+        if (queryString == null)
+            return null;
+        String[] qFields = queryString.split("&");
+        String[] kv;
+        Map<String, String> queryParams = new HashMap<String, String>();
+        for (int i = 0; i < qFields.length; i++) {
+            kv = qFields[i].split("=");
+            // consider only valid query params AND skip duplicate keys
+            if (kv.length == 2 && !(queryParams.containsKey(kv[0]))) {
+                    queryParams.put(kv[0], kv[1]);
+            }
+        }
+        return queryParams;
+    }
+
+    /**
+     * Apache Pig UDF that provides information about URLs
+     * @param tuple containing URL string
+     * @return tuple containing domain name, host name, protocol, path,
+     * port and query parameters (in that order)
+     *
+     */
+    @Override
+    public Tuple exec(Tuple tuple) throws IOException {
+        URL url;
+        final Tuple output = TupleFactory.getInstance()
+                .newTuple(TUPLE_ELEMENTS);
+        for (int i = 0; i < TUPLE_ELEMENTS; i++)
+            output.set(i, null);
+        if (tuple == null)
+            return output;
+        String input = tuple.get(0) == null ? "" : tuple.get(0).toString()
+                .trim();
+        try {
+            url = new URL(input);
+        } catch (MalformedURLException e) {
+            return null;
+        }
+
+        output.set(0, getDomain(url));
+        output.set(1, getHost(url));
+        output.set(2, getProtocol(url));
+        output.set(3, getPath(url));
+        output.set(4, getPort(url));
+        output.set(5, getQueryParams(url));
+        return output;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input) {
+        try {
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema(
+                    "domain", DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema(
+                    "host", DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema(
+                    "protocol", DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema(
+                    "path", DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema(
+                    "port", DataType.INTEGER));
+            tupleSchema.add(new Schema.FieldSchema(
+                    "queryParams", DataType.MAP));
+
+            return new Schema(new Schema.FieldSchema(
+                    getSchemaName(this.getClass().getName().toLowerCase(),
+                            input), tupleSchema, DataType.TUPLE));
+        } catch (Exception e) {
+            System.out.println("exception in URLInfo outputSchema: '" + e
+                    + "'; returning null schema.\n");
+            return null;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/b7bef9c8/datafu-pig/src/test/java/datafu/test/pig/urls/URLInfoTest.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/urls/URLInfoTest.java b/datafu-pig/src/test/java/datafu/test/pig/urls/URLInfoTest.java
new file mode 100644
index 0000000..8f77d08
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/urls/URLInfoTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.urls;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+import static org.testng.Assert.*;
+
+import datafu.test.pig.PigTests;
+
+public class URLInfoTest extends PigTests
+{
+    // Pig script under test: loads pipe-delimited rows, applies URLInfo to the
+    // url column and stores the result next to the expected values.
+    /**
+     define URLInfo datafu.pig.urls.URLInfo();
+
+     data = load 'input' using PigStorage('|') as (i:int, url:chararray,expected:tuple(domain:chararray,host:chararray,protocol:chararray,path:chararray,port:int), queryparams:map[chararray]);
+     data_out = foreach data generate i, URLInfo(url) as url_info, expected, queryparams;
+
+     store data_out into 'output';
+     */
+    @Multiline private String urlInfoTest;
+
+    /**
+     * Runs URLInfo over a handful of URLs and compares every field of the
+     * resulting tuple with the expectation stored in the same input row.
+     */
+    @Test
+    public void urlInfoTest() throws Exception
+    {
+        PigTest pigTest = createPigTestFromString(urlInfoTest);
+
+        // Row layout: id | url | (domain,host,protocol,path,port) | query-param map
+        // NOTE(review): rows 2-5 end with an unbalanced ')' in the last column - confirm this is intended.
+        String[] rows = {
+                "1|http://roger.bar.com/marketing/brand.html?x=foo|(bar.com,roger.bar.com,http,/marketing/brand.html,)|[x#foo]",
+                "2|https://hello.world.org:90/products/data/data.html|(world.org,hello.world.org,https,/products/data/data.html,90)|)",
+                "3|ftp://roger.bar.com/eng/hello.jsp?x=foo&y=bar|(bar.com,roger.bar.com,ftp,/eng/hello.jsp,)|[x#foo,y#bar])",
+                "4|http://hello.world.org:90/products/data/data.html|(world.org,hello.world.org,http,/products/data/data.html,90)|)",
+                "5|http://roger.bar.com/eng/hello.jsp?x=foo&y=bar&x=baz|(bar.com,roger.bar.com,http,/eng/hello.jsp,)|[x#foo,y#bar])"
+        };
+
+        writeLinesToFile("input", rows);
+
+        pigTest.runScript();
+
+        for (Tuple row : getLinesForAlias(pigTest, "data_out"))
+        {
+            System.out.println("Validating case " + row.get(0));
+            Tuple urlInfo = (Tuple) row.get(1);
+            Tuple wanted = (Tuple) row.get(2);
+            // domain, host, protocol, path, port
+            for (int field = 0; field < 5; field++)
+            {
+                assertEquals(urlInfo.get(field), wanted.get(field));
+            }
+            // the expected query params are loaded as their own column
+            assertEquals(urlInfo.get(5), row.get(3));
+        }
+    }
+}


Mime
View raw message