nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Mattmann, Chris A (3980)" <chris.a.mattm...@jpl.nasa.gov>
Subject Re: svn commit: r1637236 - in /nutch: branches/2.x/ branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ trunk/ trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
Date Sat, 08 Nov 2014 17:17:50 GMT
Thanks Seb!

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Chief Architect
Instrument Software and Science Data Systems Section (398)
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 168-519, Mailstop: 168-527
Email: chris.a.mattmann@nasa.gov
WWW:  http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Associate Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++






-----Original Message-----
From: "snagel@apache.org" <snagel@apache.org>
Reply-To: "dev@nutch.apache.org" <dev@nutch.apache.org>
Date: Thursday, November 6, 2014 at 1:51 PM
To: "commits@nutch.apache.org" <commits@nutch.apache.org>
Subject: svn commit: r1637236 - in /nutch: branches/2.x/ branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ trunk/ trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/

>Author: snagel
>Date: Thu Nov  6 21:51:46 2014
>New Revision: 1637236
>
>URL: http://svn.apache.org/r1637236
>Log:
>NUTCH-1825 protocol-http may hang for certain web pages
>
>Modified:
>    nutch/branches/2.x/CHANGES.txt
>    
>nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot
>ocol/http/HttpResponse.java
>    nutch/trunk/CHANGES.txt
>    
>nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht
>tp/HttpResponse.java
>
>Modified: nutch/branches/2.x/CHANGES.txt
>URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
>==========================================================================
>====
>--- nutch/branches/2.x/CHANGES.txt (original)
>+++ nutch/branches/2.x/CHANGES.txt Thu Nov  6 21:51:46 2014
>@@ -2,6 +2,8 @@ Nutch Change Log
> 
> Current Development 2.3-SNAPSHOT
> 
>+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via
>snagel)
>+
> * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério
>Pereira Araújo, Mengying Wang, snagel)
> 
> * NUTCH-1885 Protocol-file should treat symbolic links as redirects
>(Mengying Wang, snagel)
>
>Modified: 
>nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot
>ocol/http/HttpResponse.java
>URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
>==========================================================================
>====
>--- 
>nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot
>ocol/http/HttpResponse.java (original)
>+++ 
>nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/prot
>ocol/http/HttpResponse.java Thu Nov  6 21:51:46 2014
>@@ -278,11 +278,22 @@ public class HttpResponse implements Res
> 
>     ByteArrayOutputStream out = new
>ByteArrayOutputStream(Http.BUFFER_SIZE);
>     byte[] bytes = new byte[Http.BUFFER_SIZE];
>-    int length = 0;                           // read content
>-    for (int i = in.read(bytes); i != -1 && length + i <= contentLength;
>i = in.read(bytes)) {
>-
>+    int length = 0;
>+    // read content
>+    int i = in.read(bytes);
>+    while (i != -1) {
>       out.write(bytes, 0, i);
>       length += i;
>+      if (length >= contentLength) {
>+        break;
>+      }
>+      if ((length + Http.BUFFER_SIZE) > contentLength) {
>+        // reading next chunk may hit contentLength,
>+        // must limit number of bytes read
>+        i = in.read(bytes, 0, (contentLength - length));
>+      } else {
>+        i = in.read(bytes);
>+      }
>     }
>     content = out.toByteArray();
>   }
>
>Modified: nutch/trunk/CHANGES.txt
>URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637236&r1=1637235&r2=1637236&view=diff
>==========================================================================
>====
>--- nutch/trunk/CHANGES.txt (original)
>+++ nutch/trunk/CHANGES.txt Thu Nov  6 21:51:46 2014
>@@ -2,6 +2,8 @@ Nutch Change Log
> 
> Nutch Current Development 1.10-SNAPSHOT
> 
>+* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via
>snagel)
>+
> * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério
>Pereira Araújo, Mengying Wang, snagel)
> 
> * NUTCH-1885 Protocol-file should treat symbolic links as redirects
>(Mengying Wang, snagel)
>
>Modified: 
>nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht
>tp/HttpResponse.java
>URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1637236&r1=1637235&r2=1637236&view=diff
>==========================================================================
>====
>--- 
>nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht
>tp/HttpResponse.java (original)
>+++ 
>nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/ht
>tp/HttpResponse.java Thu Nov  6 21:51:46 2014
>@@ -26,16 +26,14 @@ import java.io.PushbackInputStream;
> import java.net.InetSocketAddress;
> import java.net.Socket;
> import java.net.URL;
>-
> import java.util.Arrays;
> import java.util.HashSet;
> import java.util.Set;
>- 
>+
> import javax.net.ssl.SSLSocket;
> import javax.net.ssl.SSLSocketFactory;
> 
> import org.apache.hadoop.conf.Configuration;
>-
> import org.apache.nutch.crawl.CrawlDatum;
> import org.apache.nutch.metadata.Metadata;
> import org.apache.nutch.metadata.SpellCheckedMetadata;
>@@ -289,11 +287,22 @@ public class HttpResponse implements Res
> 
>     ByteArrayOutputStream out = new
>ByteArrayOutputStream(Http.BUFFER_SIZE);
>     byte[] bytes = new byte[Http.BUFFER_SIZE];
>-    int length = 0;                           // read content
>-    for (int i = in.read(bytes); i != -1 && length + i <= contentLength;
>i = in.read(bytes)) {
>-
>+    int length = 0;
>+    // read content
>+    int i = in.read(bytes);
>+    while (i != -1) {
>       out.write(bytes, 0, i);
>       length += i;
>+      if (length >= contentLength) {
>+        break;
>+      }
>+      if ((length + Http.BUFFER_SIZE) > contentLength) {
>+        // reading next chunk may hit contentLength,
>+        // must limit number of bytes read
>+        i = in.read(bytes, 0, (contentLength - length));
>+      } else {
>+        i = in.read(bytes);
>+      }
>     }
>     content = out.toByteArray();
>   }
>
>


Mime
View raw message