nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ysc (JIRA)" <j...@apache.org>
Subject [jira] [Issue Comment Deleted] (NUTCH-1736) can't fetch page if http response header contains Transfer-Encoding:chunked
Date Sat, 15 Mar 2014 07:40:43 GMT

     [ https://issues.apache.org/jira/browse/NUTCH-1736?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ysc updated NUTCH-1736:
-----------------------

    Comment: was deleted

(was: 1. For Nutch 1.x, the following patch can be used:

#P nutch1.7
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java	(revision 1573324)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java	(working copy)
@@ -26,6 +26,8 @@
  */
 public interface HttpHeaders {
 
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
+	
   public final static String CONTENT_ENCODING = "Content-Encoding";
   
   public final static String CONTENT_LANGUAGE = "Content-Language";
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 1573324)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -156,9 +156,13 @@
         parseHeaders(in, line);
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING); 
+      if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){
+    	  readChunkedContent(in, line);  
+      }else{
+    	  readPlainContent(in);  
+      }
 
-      readPlainContent(in);
-
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
         content = http.processGzipEncoded(content, url);
@@ -432,5 +436,4 @@
     in.unread(value);
     return value;
   }
-
 }

2. For Nutch 2.x, the following patch can be used:

#P nutch-2.2.1
Index: src/java/org/apache/nutch/metadata/HttpHeaders.java
===================================================================
--- src/java/org/apache/nutch/metadata/HttpHeaders.java	(revision 1523958)
+++ src/java/org/apache/nutch/metadata/HttpHeaders.java	(working copy)
@@ -28,6 +28,7 @@
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface HttpHeaders {
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
 
   public final static String CONTENT_ENCODING = "Content-Encoding";
 
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 1523958)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -150,7 +150,12 @@
         haveSeenNonContinueStatus= code != 100; // 100 is "Continue"
       }
 
-      readPlainContent(in);
+	  String transferEncoding = getHeader(Response.TRANSFER_ENCODING); 
+	  if(transferEncoding != null && "chunked".equalsIgnoreCase(transferEncoding.trim())){
+		 readChunkedContent(in, line);  
+	  }else{
+		 readPlainContent(in);  
+	  }
 
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
@@ -234,7 +239,92 @@
     }
     content = out.toByteArray();
   }
+  /**
+   * 
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  @SuppressWarnings("unused")
+  private void readChunkedContent(PushbackInputStream in,  
+                                  StringBuffer line) 
+    throws HttpException, IOException {
+    boolean doneChunks= false;
+    int contentBytesRead= 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
 
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'"); }
+
+      int pos= line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr= line.toString();
+      } else {
+        chunkLenStr= line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " + line.substring(pos+1)); }
+      }
+      chunkLenStr= chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen= Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e){ 
+        throw new HttpException("bad chunk length: "+line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks= true;
+        break;
+      }
+
+      if ( (contentBytesRead + chunkLen) > http.getMaxContent() )
+        chunkLen= http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead= 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead= (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+                    (chunkLen - chunkBytesRead) : Http.BUFFER_SIZE;
+        int len= in.read(bytes, 0, toRead);
+
+        if (len == -1) 
+          throw new HttpException("chunk eof after " + contentBytesRead
+                                      + " bytes in successful chunks"
+                                      + " and " + chunkBytesRead 
+                                      + " in current chunk");
+
+        // DANGER!!! Will printed GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " +  new String(bytes, 0, len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead+= len;  
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent()) 
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line);
+
+  }
+  
   private int parseStatusLine(PushbackInputStream in, StringBuffer line)
     throws IOException, HttpException {
     readLine(in, line, false);
)

> can't fetch page if http response header contains Transfer-Encoding:chunked
> ---------------------------------------------------------------------------
>
>                 Key: NUTCH-1736
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1736
>             Project: Nutch
>          Issue Type: Bug
>          Components: protocol
>    Affects Versions: 1.6, 2.1, 1.7, 2.2, 2.3, 1.8, 2.4, 1.9, 2.2.1
>            Reporter: ysc
>            Priority: Critical
>   Original Estimate: 24h
>  Remaining Estimate: 24h
>
> fetching: http://szs.mof.gov.cn/zhengwuxinxi/zhengcefabu/201402/t20140224_1046354.html
> Fetch failed with protocol status: EXCEPTION: java.io.IOException: unzipBestEffort returned null



--
This message was sent by Atlassian JIRA
(v6.2#6252)

Mime
View raw message