nutch-dev mailing list archives

From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2038) Naive Bayes classifier based url filter
Date Fri, 19 Jun 2015 03:19:01 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2038?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14592945#comment-14592945 ]

ASF GitHub Bot commented on NUTCH-2038:
---------------------------------------

Github user chrismattmann commented on a diff in the pull request:

    https://github.com/apache/nutch/pull/32#discussion_r32798921
  
    --- Diff: src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java ---
    @@ -0,0 +1,234 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.nutch.urlfilter.model;
    +
    +import java.io.BufferedReader;
    +import java.io.FileReader;
    +import java.io.IOException;
    +import java.io.StringReader;
    +import java.util.HashMap;
    +import java.util.Map;
    +
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.FileSystem;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.IntWritable;
    +import org.apache.hadoop.io.LongWritable;
    +import org.apache.hadoop.io.SequenceFile;
    +import org.apache.hadoop.io.SequenceFile.Writer;
    +import org.apache.hadoop.io.Text;
    +import org.apache.lucene.analysis.Analyzer;
    +import org.apache.lucene.analysis.TokenStream;
    +import org.apache.lucene.analysis.standard.StandardAnalyzer;
    +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    +import org.apache.lucene.util.Version;
    +import org.apache.mahout.classifier.naivebayes.BayesUtils;
    +import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
    +import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
    +import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
    +import org.apache.mahout.common.Pair;
    +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
    +import org.apache.mahout.math.RandomAccessSparseVector;
    +import org.apache.mahout.math.Vector;
    +import org.apache.mahout.math.Vector.Element;
    +import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
    +import org.apache.mahout.vectorizer.TFIDF;
    +
    +import com.google.common.collect.ConcurrentHashMultiset;
    +import com.google.common.collect.Multiset;
    +
    +public class NBClassifier {
    +
    +	public static Map<String, Integer> readDictionary(Configuration conf,
    +			Path dictionaryPath) {
    +		Map<String, Integer> dictionary = new HashMap<String, Integer>();
    +		// dictionary.file-0 maps each term to its integer feature id
    +		for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(
    +				dictionaryPath, true, conf)) {
    +			dictionary.put(pair.getFirst().toString(), pair.getSecond().get());
    +		}
    +		return dictionary;
    +	}
    +
    +	public static Map<Integer, Long> readDocumentFrequency(Configuration conf,
    +			Path documentFrequencyPath) {
    +		Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
    +		// df-count maps each feature id to its document frequency; the
    +		// special key -1 holds the total number of documents
    +		for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(
    +				documentFrequencyPath, true, conf)) {
    +			documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
    +		}
    +		return documentFrequency;
    +	}
    +
    +	public static void createModel(String inputTrainFilePath) throws Exception {
    +
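    +		// arguments for Mahout's seq2sparse: read the SequenceFile
    +		// directory "outseq" and write TF-IDF vectors (plus dictionary
    +		// and df-count files) under "vectors"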
    +		String[] args1 = new String[4];
    +
    +		args1[0] = "-i";
    +		args1[1] = "outseq";
    +		args1[2] = "-o";
    +		args1[3] = "vectors";
    +
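    +		// arguments for Mahout's trainnb: train on the TF-IDF vectors,
    +		// extract labels from the keys (-el), write the label index and
    +		// the model, overwrite existing output (-ow), and enable
    +		// complementary training (-c)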
    +		String[] args2 = new String[9];
    +
    +		args2[0] = "-i";
    +		args2[1] = "vectors/tfidf-vectors";
    +		args2[2] = "-el";
    +		args2[3] = "-li";
    +		args2[4] = "labelindex";
    +		args2[5] = "-o";
    +		args2[6] = "model";
    +		args2[7] = "-ow";
    +		args2[8] = "-c";
    +
    +		convertToSeq(inputTrainFilePath, "outseq");
    +
    +		SparseVectorsFromSequenceFiles.main(args1);
    +
    +		TrainNaiveBayesJob.main(args2);
    +	}
    +
    +	public static String classify(String text) throws IOException {
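    +		// default relative paths matching what createModel() writes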
    +		return classify(text, "model", "labelindex",
    +				"vectors/dictionary.file-0", "vectors/df-count/part-r-00000");
    +	}
    +
    +	public static String classify(String text, String modelPath,
    +			String labelIndexPath, String dictionaryPath,
    +			String documentFrequencyPath) throws IOException {
    +
    +		Configuration configuration = new Configuration();
    +
    +		// model is a matrix (wordId, labelId) => probability score
    +		NaiveBayesModel model = NaiveBayesModel.materialize(
    +				new Path(modelPath), configuration);
    +
    +		StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(
    +				model);
    +
    +		// labels is a map label => classId
    +		Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration,
    +				new Path(labelIndexPath));
    +		Map<String, Integer> dictionary = readDictionary(configuration,
    +				new Path(dictionaryPath));
    +		Map<Integer, Long> documentFrequency = readDocumentFrequency(
    +				configuration, new Path(documentFrequencyPath));
    +
    +		// analyzer used to extract word from text
    +		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    +		// int labelCount = labels.size();
    +		int documentCount = documentFrequency.get(-1).intValue();
    +
    +		Multiset<String> words = ConcurrentHashMultiset.create();
    +
    +		// extract words from text
    +		TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
    +		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    +		ts.reset();
    +		int wordCount = 0;
    +		while (ts.incrementToken()) {
    +			if (termAtt.length() > 0) {
    +				String word = termAtt.toString();
    +				Integer wordId = dictionary.get(word);
    +				// if the word is not in the dictionary, skip it
    +				if (wordId != null) {
    +					words.add(word);
    +					wordCount++;
    +				}
    +			}
    +		}
    +
    +		ts.end();
    +		ts.close();
    +		// create vector wordId => weight using tfidf; size the vector to
    +		// the dictionary so every feature id fits (a hardcoded 10000
    +		// could overflow on larger vocabularies)
    +		Vector vector = new RandomAccessSparseVector(dictionary.size());
    +		TFIDF tfidf = new TFIDF();
    +		for (Multiset.Entry<String> entry : words.entrySet()) {
    +			String word = entry.getElement();
    +			int count = entry.getCount();
    +			Integer wordId = dictionary.get(word);
    +			Long freq = documentFrequency.get(wordId);
    +			double tfIdfValue = tfidf.calculate(count, freq.intValue(),
    +					wordCount, documentCount);
    +			vector.setQuick(wordId, tfIdfValue);
    +		}
    +		// one score for each label
    +
    +		Vector resultVector = classifier.classifyFull(vector);
    +		double bestScore = -Double.MAX_VALUE;
    +		int bestCategoryId = -1;
    +		for (Element element : resultVector.all()) {
    +			int categoryId = element.index();
    +			double score = element.get();
    +			if (score > bestScore) {
    +				bestScore = score;
    +				bestCategoryId = categoryId;
    +			}
    +		}
    +
    +		analyzer.close();
    +		return labels.get(bestCategoryId);
    +
    +	}
    +
    +	static void convertToSeq(String inputFileName, String outputDirName)
    +			throws IOException {
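    +		// expects one document per line, tab-separated as
    +		// <category>\t<id>\t<text>; the key "/<category>/<id>" lets
    +		// trainnb's -el flag recover the label later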
    +		Configuration configuration = new Configuration();
    +		FileSystem fs = FileSystem.get(configuration);
    +		Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
    +				outputDirName + "/chunk-0"), Text.class, Text.class);
    +
    +		BufferedReader reader = new BufferedReader(
    +				new FileReader(inputFileName));
    +		Text key = new Text();
    +		Text value = new Text();
    +		String line;
    +		while ((line = reader.readLine()) != null) {
    +			String[] tokens = line.split("\t", 3);
    +			if (tokens.length != 3) {
    +				// malformed line; skip it
    +				continue;
    +			}
    +			String category = tokens[0];
    +			String id = tokens[1];
    +			String message = tokens[2];
    +			key.set("/" + category + "/" + id);
    +			value.set(message);
    +			writer.append(key, value);
    +
    +		}
    +		reader.close();
    +		writer.close();
    +
    +	}
    +
    +	public static void main(String args[]) throws Exception {
    --- End diff --
    
    +1
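
For anyone trying the patch locally, here is a minimal end-to-end sketch of driving the class above. The demo class name, the training file train.tsv, and the sample text are hypothetical; it assumes Hadoop and Mahout on the classpath:

    import org.apache.nutch.urlfilter.model.NBClassifier;

    public class NBClassifierDemo {
    	public static void main(String[] args) throws Exception {
    		// "train.tsv" is a hypothetical tab-separated training file of
    		// <category>\t<id>\t<text> lines, one document per line
    		NBClassifier.createModel("train.tsv");

    		// classify new text against the trained model, using the default
    		// relative paths ("model", "labelindex", ...) that createModel()
    		// just wrote out
    		String label = NBClassifier.classify("example page text to label");
    		System.out.println("predicted label: " + label);
    	}
    }

Since createModel writes outseq, vectors, labelindex and model relative to the working directory, the demo is best run from an empty scratch directory.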


> Naive Bayes classifier based url filter
> ---------------------------------------
>
>                 Key: NUTCH-2038
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2038
>             Project: Nutch
>          Issue Type: New Feature
>          Components: fetcher, injector, parser
>            Reporter: Asitang Mishra
>            Assignee: Chris A. Mattmann
>              Labels: memex, nutch
>             Fix For: 1.11
>
>
> A URL filter that filters out URLs from pages the classifier marks as
> irrelevant. It runs after the parsing stage and keeps only those URLs
> that contain certain "hot words", supplied in a list.
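
For illustration, one way the classifier could back Nutch's URLFilter extension point. The plugin class, the "relevant" label, and classifying the raw URL string are assumptions for this sketch, not the patch's actual wiring (the patch may well classify parsed page text instead):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.net.URLFilter;
    import org.apache.nutch.urlfilter.model.NBClassifier;

    // hypothetical plugin class, not part of the patch as posted
    public class NaiveBayesURLFilter implements URLFilter {
    	private Configuration conf;

    	@Override
    	public String filter(String urlString) {
    		try {
    			// keep the URL only if the trained model labels it with the
    			// assumed "relevant" category; otherwise reject it by
    			// returning null, per the URLFilter contract
    			String label = NBClassifier.classify(urlString);
    			return "relevant".equals(label) ? urlString : null;
    		} catch (Exception e) {
    			return null; // on any failure, drop the URL
    		}
    	}

    	@Override
    	public void setConf(Configuration conf) {
    		this.conf = conf;
    	}

    	@Override
    	public Configuration getConf() {
    		return conf;
    	}
    }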



