nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Markus Jelsma (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-1693) TextMD5Signatue compute on textual content
Date Wed, 08 Jan 2014 11:00:52 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13865306#comment-13865306
] 

Markus Jelsma commented on NUTCH-1693:
--------------------------------------

By the way, there are several places in Nutch that still use getBytes(). Would you suggest
we do something about that too?

{code}
markus@midas:~/projects/apache/nutch/trunk$ grep -nr getBytes src/ | grep -v svn
src/test/org/apache/nutch/protocol/TestContent.java:48:    Content r = new Content(url, url,
page.getBytes("UTF8"), "text/html",
src/test/org/apache/nutch/protocol/TestContent.java:64:                    "".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:70:                    "".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:76:                    "".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:82:                    "<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:88:                    "<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:94:                    "<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:100:                    "".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:106:                    "".getBytes("UTF8"),
src/test/org/apache/nutch/util/TestGZIPUtils.java:121:    byte[] testBytes= SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:123:    testBytes= LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:125:    testBytes= WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:130:    byte[] testBytes= SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:132:    testBytes= LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:134:    testBytes= WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:139:    byte[] testBytes= SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:141:    testBytes= LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:143:    testBytes= WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:148:    byte[] testBytes= SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:150:    testBytes= LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:152:    testBytes= WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestEncodingDetector.java:35:      contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8");
src/test/org/apache/nutch/util/TestNodeWalker.java:65:      parser.parse(new InputSource(new
ByteArrayInputStream(WEBPAGE.getBytes())));
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:58:      byte[] bytes = testA.getBytes("UTF-8");
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:62:      os.write(p.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:80:        os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:86:        os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:92:      os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:93:      os.write(testB.getBytes());
src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java:318:        Content content = new
Content(urlStr, urlStr, bytes.getBytes(), contentType,
src/java/org/apache/nutch/tools/Benchmark.java:65:      os.write(url.getBytes());
src/java/org/apache/nutch/crawl/MD5Signature.java:35:    if (data == null) data = content.getUrl().getBytes();
src/java/org/apache/nutch/crawl/Generator.java:375:      int hash1 = hash(url1.getBytes(),
0, url1.getLength());
src/java/org/apache/nutch/crawl/Generator.java:376:      int hash2 = hash(url2.getBytes(),
0, url2.getLength());
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java:512:    return
new String(x).getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java:151:
     byte[] bytes= tests[i].getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java:242: 
            new ByteArrayInputStream(testPages[i].getBytes()) ),
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java:308:  
 return new String(x).getBytes();
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java:92:
       byte[] credBytes = (username + ":" + password).getBytes();
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:84:
   rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:93:
   rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:109:
   rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:113:
   rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java:152:   
  byte[] bytes= tests[i].getBytes();
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java:187:       
    new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java:128:
   return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create());
src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java:82:    fos.write(expectedText.getBytes());
src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java:267:          new Content(link,
link, text.getBytes(), contentType, contentMeta,
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:154:  
   byte[] reqBytes= reqStr.toString().getBytes();
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:403:  
     in.unread(line.substring(pos).getBytes("UTF-8"));
src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java:67: 
  InputStream is=new ByteArrayInputStream(xml.toString().getBytes());

{code}

> TextMD5Signatue compute on textual content
> ------------------------------------------
>
>                 Key: NUTCH-1693
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1693
>             Project: Nutch
>          Issue Type: New Feature
>            Reporter: Tien Nguyen Manh
>            Assignee: Markus Jelsma
>            Priority: Minor
>             Fix For: 2.3, 1.8
>
>         Attachments: NUTCH-1693-trunk.patch, NUTCH-1693-trunk.patch, NUTCH-1693.patch
>
>
> I created a new MD5Signature that is based on textual content. In our case we use boilerpipe
to extract the main text from the content, so this signature is more effective for deduplication.



--
This message was sent by Atlassian JIRA
(v6.1.5#6160)

Mime
View raw message