nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sebastian-nagel <...@git.apache.org>
Subject [GitHub] nutch pull request: Fix for NUTCH-2139 contributed by jorgelbg
Date Thu, 15 Oct 2015 18:23:31 GMT
Github user sebastian-nagel commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/78#discussion_r42161108
  
    --- Diff: src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java ---
    @@ -0,0 +1,168 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +package org.apache.nutch.indexer.links;
    +
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.io.Text;
    +import org.apache.nutch.crawl.CrawlDatum;
    +import org.apache.nutch.crawl.Inlink;
    +import org.apache.nutch.crawl.Inlinks;
    +import org.apache.nutch.indexer.IndexingException;
    +import org.apache.nutch.indexer.IndexingFilter;
    +import org.apache.nutch.indexer.NutchDocument;
    +import org.apache.nutch.parse.Outlink;
    +import org.apache.nutch.parse.Parse;
    +import org.slf4j.LoggerFactory;
    +
    +import java.net.MalformedURLException;
    +import java.net.URL;
    +import java.util.HashSet;
    +import java.util.Iterator;
    +import java.util.Set;
    +
    +/**
    + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
    + * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
    + *
    + * In case that you want to ignore the outlinks that point to the same host
    + * as the URL being indexed use the following settings in your configuration
    + * file:
    + *
    + * <property>
    + *   <name>outlinks.host.ignore</name>
    + *   <value>true</value>
    + * </property>
    + *
    + * The same configuration is available for inlinks:
    + *
    + * <property>
    + *   <name>inlinks.host.ignore</name>
    + *   <value>true</value>
    + * </property>
    + *
    + * To store only the host portion of each inlink URL or outlink URL add the
    + * following to your configuration file.
    + *
    + * <property>
    + *   <name>links.hosts.only</name>
    + *   <value>true</value>
    + * </property>
    + *
    + */
    +public class LinksIndexingFilter implements IndexingFilter {
    +
    +  public final static String LINKS_OUTLINKS_HOST = "outlinks.host.ignore";
    +  public final static String LINKS_INLINKS_HOST = "inlinks.host.ignore";
    +  public final static String LINKS_ONLY_HOSTS = "links.hosts.only";
    +
    +  public final static org.slf4j.Logger LOG = LoggerFactory
    +      .getLogger(LinksIndexingFilter.class);
    +
    +  private Configuration conf;
    +  private boolean filterOutlinks;
    +  private boolean filterInlinks;
    +  private boolean indexHost;
    +
    +  @Override
    +  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    +      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    +
    +    // Add the outlinks
    +    Outlink[] outlinks = parse.getData().getOutlinks();
    +
    +    try {
    +      if (outlinks != null) {
    --- End diff --
    
    See the comment below regarding the nesting of try-catch and loops: outlinks
    are not necessarily validated inside the parse data.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message