drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From GitBox <...@apache.org>
Subject [GitHub] [drill] cgivre commented on a change in pull request #2112: DRILL-7534: Convert HTTPD Format Plugin to EVF
Date Sun, 15 Nov 2020 14:42:56 GMT

cgivre commented on a change in pull request #2112:
URL: https://github.com/apache/drill/pull/2112#discussion_r523768127



##########
File path: contrib/format-httpd/src/main/java/org/apache/drill/exec/store/httpd/HttpdParser.java
##########
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.httpd;
+
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
+import org.apache.drill.shaded.guava.com.google.common.collect.Maps;
+import nl.basjes.parse.core.Casts;
+import nl.basjes.parse.core.Parser;
+import nl.basjes.parse.core.exceptions.DissectionFailure;
+import nl.basjes.parse.core.exceptions.InvalidDissectorException;
+import nl.basjes.parse.core.exceptions.MissingDissectorsException;
+import nl.basjes.parse.httpdlog.HttpdLoglineParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class HttpdParser {
+
+  private static final Logger logger = LoggerFactory.getLogger(HttpdParser.class);
+
+  public static final String PARSER_WILDCARD = ".*";
+  public static final String REMAPPING_FLAG = "#";
+
+  private final Parser<HttpdLogRecord> parser;
+  private Map<String, String> requestedPaths;
+  private final Map<String, MinorType> mappedColumns;
+  private EnumSet<Casts> casts;
+  private final HttpdLogRecord record;
+  private final String logFormat;
+
+
+  /**
+   * Builds the log-line parser and the record it fills for the given log format.
+   *
+   * @param logFormat log format definition handed to the underlying {@code HttpdLoglineParser};
+   *                  must not be null or blank
+   * @param timestampFormat custom timestamp format passed to both the record and the parser;
+   *                        the null/blank check below suggests it is optional (TODO confirm
+   *                        that the record and the parser accept null)
+   * @throws IllegalArgumentException if {@code logFormat} is null or blank (raised by the
+   *                                  {@code Preconditions.checkArgument} call)
+   */
+  public HttpdParser(final String logFormat, final String timestampFormat) {
+
+    // Fail fast: nothing below can work without a log format.
+    Preconditions.checkArgument(logFormat != null && !logFormat.trim().isEmpty(),
"logFormat cannot be null or empty");
+
+    this.logFormat = logFormat;
+    this.record = new HttpdLogRecord(timestampFormat);
+    this.parser = new HttpdLoglineParser<>(HttpdLogRecord.class, logFormat, timestampFormat);
+    // Purely informational: a custom timestamp format is accepted but flagged in the log.
+    if (timestampFormat != null && !timestampFormat.trim().isEmpty()) {
+      logger.info("Custom timestamp format has been specified. This is an informational note
only as custom timestamps is rather unusual.");
+    }
+    // Purely informational: a format containing a newline is reported as multiline.
+    if (logFormat.contains("\n")) {
+      logger.info("Specified logformat is a multiline log format: {}", logFormat);
+    }
+
+    // Starts empty; nothing in this constructor populates it.
+    mappedColumns = new HashMap<>();
+  }
+
+  /**
+   * Parses a single log line into the internal record and then calls
+   * {@code finishRecord()} on that record.
+   * We do not expose the underlying parser or the record which is used to manage the writers.
+   *
+   * @param line log line to tear apart.
+   * @throws DissectionFailure propagated from the calls made here; presumably raised when the
+   *                           line cannot be dissected (confirm against the logparser library)
+   * @throws InvalidDissectorException propagated from the calls made here; presumably signals
+   *                                   a misconfigured dissector (confirm against the library)
+   * @throws MissingDissectorsException propagated from the calls made here; presumably signals
+   *                                    that a required dissector is absent (confirm)
+   */
+  public void parse(final String line) throws DissectionFailure, InvalidDissectorException,
MissingDissectorsException {
+    // Fill the shared record from the line, then signal that the record is complete.
+    parser.parse(record, line);
+    record.finishRecord();
+  }
+
+  /*
+   * The parser deals with dots unlike Drill wanting underscores request_referer. For the
sake of simplicity we are
+   * going to replace the dots. The resultant output field will look like: request.referer.<br>
+   * Additionally, wild cards will get replaced with .*
+   *
+   * @param drillFieldName name to be cleansed.
+   * @return cleaned string
+   */
+ /* public static String parserFormattedFieldName(String drillFieldName) {
+    String tempFieldName;
+    tempFieldName = LOGFIELDS.get(drillFieldName);
+    return tempFieldName.replace(SAFE_WILDCARD, PARSER_WILDCARD).replaceAll(SAFE_SEPARATOR,
".").replaceAll("\\.\\.", "_");
+  }*/
+
+  public TupleMetadata setupParser()
+          throws NoSuchMethodException, MissingDissectorsException, InvalidDissectorException
{
+
+    SchemaBuilder builder = new SchemaBuilder();
+
+    /*
+     * If the user has selected fields, then we will use them to configure the parser because
this would be the most
+     * efficient way to parse the log.
+     */
+    List<String> allParserPaths = parser.getPossiblePaths();
+
+    /*
+     * Use all possible paths that the parser has determined from the specified log format.
+     */
+
+    requestedPaths = Maps.newHashMap();
+    for (final String parserPath : allParserPaths) {
+      requestedPaths.put(HttpdUtils.drillFormattedFieldName(parserPath), parserPath);
+    }
+
+    /*
+     * By adding the parse target to the dummy instance we activate it for use. Which we
can then use to find out which
+     * paths cast to which native data types. After we are done figuring this information
out, we throw this away
+     * because this will be the slowest parsing path possible for the specified format.
+     */
+    Parser<Object> dummy = new HttpdLoglineParser<>(Object.class, logFormat);
+
+    // TODO Don't we want requested paths here... not the all possible

Review comment:
       Fixed




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



Mime
View raw message