poi-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From bugzi...@apache.org
Subject DO NOT REPLY [Bug 38616] - patch that contains functionality that allows picture extraction from Microsoft Word
Date Thu, 23 Feb 2006 13:19:37 GMT
DO NOT REPLY TO THIS EMAIL, BUT PLEASE POST YOUR BUG·
RELATED COMMENTS THROUGH THE WEB INTERFACE AVAILABLE AT
<http://issues.apache.org/bugzilla/show_bug.cgi?id=38616>.
ANY REPLY MADE TO THIS MESSAGE WILL NOT BE COLLECTED AND·
INSERTED IN THE BUG DATABASE.

http://issues.apache.org/bugzilla/show_bug.cgi?id=38616





------- Additional Comments From DmitryR@kyiv.vdiweb.com  2006-02-23 14:19 -------
(From update of attachment 17657)
Index:
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
===================================================================
--- D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
(revision 376982)
+++ D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
(working copy)
@@ -89,6 +89,9 @@
   /** Hold list tables */
   protected ListTables _lt;

+  /** Holds pictures table */
+  protected PicturesTable _pictures;
+
   protected HWPFDocument()
   {

@@ -152,6 +155,7 @@
	 _dataStream = new byte[0];
     }

+    _pictures = new PicturesTable(_dataStream);
     // get the start of text in the main stream
     int fcMin = _fib.getFcMin();

@@ -237,7 +241,16 @@
   {
     return _lt;
   }
+
   /**
+   * @return PicturesTable object, that is able to extract images from this
document
+   */
+  public PicturesTable getPicturesTable()
+  {
+    return _pictures;
+  }
+
+  /**
    * Writes out the word file that is represented by an instance of this
class.
    *
    * @param out The OutputStream to write to.
Index:
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.j
ava
===================================================================
---
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.j
ava  (revision 0)
+++
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.j
ava  (revision 0)
@@ -0,0 +1,114 @@
+/* ====================================================================
+   Copyright 2002-2006   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Picture;
+
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * Holds information about all pictures embedded in Word Document either via
"Insert -> Picture -> From File" or via
+ * clipboard. Responsible for images extraction and determining whether some
document's piece contains an embedded image.
+ * Analyzes raw data bytestream "Data" (where Word stores all
embedded objects) provided by HWPFDocument.
+ *
+ * Word stores images as is within so called "Data stream" - the stream within
a Word docfile containing various data
+ * that hang off of characters in the main stream. For example, binary data
describing in-line pictures and/or
+ * formfields and also embedded objects' native data. Word picture structures
are concatenated one after the other in
+ * the data stream if the document contains pictures.
+ * Data stream is easily reachable via HWPFDocument._dataStream property.
+ * A picture is represented in the document text stream as a special
character, a Unicode \u0001 whose
+ * CharacterRun.isSpecial() returns true. The file location of the picture in
the Word binary file is accessed
+ * via CharacterRun.getPicOffset(). The CharacterRun.getPicOffset() is a byte
offset into the data stream.
+ * Beginning at the position recorded in picOffset, a header data structure
is stored.
+ *
+ * @author Dmitry Romanov
+ */
+public class PicturesTable
+{
+  static final int TYPE_IMAGE = 0x08;
+  static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD = 0xA;
+  static final int BLOCK_TYPE_OFFSET = 0xE;
+
+  private byte[] _dataStream;
+
+  /** @link dependency
+   * @stereotype instantiate*/
+  /*# Picture lnkPicture; */
+
+  /**
+   *
+   * @param _dataStream
+   */
+  public PicturesTable(byte[] _dataStream)
+  {
+    this._dataStream = _dataStream;
+  }
+
+  /**
+   * determines whether specified CharacterRun contains reference to a picture
+   * @param run
+   */
+  public boolean hasPicture(CharacterRun run) {
+    if (run.isSpecialCharacter() && !run.isObj() && !run.isOle2() &&
!run.isData() && "\u0001".equals(run.text())) {
+      short blockType = getBlockType(_dataStream, run.getPicOffset());
+      return (blockType == TYPE_IMAGE || blockType ==
TYPE_IMAGE_PASTED_FROM_CLIPBOARD);
+    }
+    return false;
+  }
+
+  private static short getBlockType(byte[] dataStream, int pictOffset) {
+    return LittleEndian.getShort(dataStream, pictOffset + BLOCK_TYPE_OFFSET);
+  }
+
+  /**
+   * Returns picture object tied to specified CharacterRun
+   * @param run
+   * @param fillBytes if true, Picture will be returned with filled byte array
that represent picture's contents. If you don't want
+   * to have that byte array in memory but only write picture's contents to
stream, pass false and then use Picture.writeImageContent
+   * @see Picture#writeImageContent(java.io.OutputStream)
+   * @return a Picture object if picture exists for specified CharacterRun,
null otherwise. PicturesTable.hasPicture is used to determine this.
+   * @see #hasPicture(org.apache.poi.hwpf.usermodel.CharacterRun) 
+   */
+  public Picture extractPicture(CharacterRun run, boolean fillBytes) {
+    if (hasPicture(run)) {
+      return new Picture(run.getPicOffset(), _dataStream, fillBytes);
+    }
+    return null;
+  }
+
+  /**
+   * @return a list of Picture objects found in current document
+   */
+  public List getAllPictures() {
+    int i = 0;
+    ArrayList pictures = new ArrayList();
+    while(i<_dataStream.length) {
+      short blockType = getBlockType(_dataStream, i);
+      if (blockType == TYPE_IMAGE ||
blockType==TYPE_IMAGE_PASTED_FROM_CLIPBOARD) {
+	 pictures.add(new Picture(i, _dataStream, false));
+      }
+      i += LittleEndian.getInt(_dataStream, i);
+    }
+    return pictures;
+  }
+
+}
Index:
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.jav
a
===================================================================
---
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.jav
a    (revision 0)
+++
D:/java/svn-apache/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.jav
a    (revision 0)
@@ -0,0 +1,341 @@
+/* ====================================================================
+   Copyright 2002-2006   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.util.LittleEndian;
+
+import java.io.OutputStream;
+import java.io.IOException;
+
+/**
+ * Represents embedded picture extracted from Word Document
+ * @author Dmitry Romanov
+ */
+public class Picture
+{
+//  public static final int FILENAME_OFFSET = 0x7C;
+//  public static final int FILENAME_SIZE_OFFSET = 0x6C;
+  static final int BLOCK_TYPE_OFFSET = 0xE;
+  static final int PICT_HEADER_OFFSET = 0x4;
+  static final int UNKNOWN_HEADER_SIZE = 0x49;
+
+  public static final byte[] GIF = new byte[]{'G', 'I', 'F'};
+  public static final byte[] PNG = new byte[]{ (byte)0x89, 0x50, 0x4E,
0x47,0x0D,0x0A,0x1A,0x0A};
+  public static final byte[] JPG = new byte[]{(byte)0xFF, (byte)0xD8};
+  public static final byte[] BMP = new byte[]{'B', 'M'};
+  public static final byte[] TIFF = new byte[]{0x49, 0x49, 0x2A, 0x00};
+  public static final byte[] TIFF1 = new byte[]{0x4D, 0x4D, 0x00, 0x2A};
+
+  public static final byte[] IHDR = new byte[]{'I', 'H', 'D', 'R'};
+
+  private int dataBlockStartOfsset;
+  private int pictureBytesStartOffset;
+  private int dataBlockSize;
+  private int size;
+//  private String fileName;
+  private byte[] content;
+  private byte[] _dataStream;
+  private int aspectRatioX;
+  private int aspectRatioY;
+  private int height = -1;
+  private int width = -1;
+
+
+  public Picture(int dataBlockStartOfsset, byte[] _dataStream, boolean
fillBytes)
+  {
+    this._dataStream = _dataStream;
+    this.dataBlockStartOfsset = dataBlockStartOfsset;
+    this.pictureBytesStartOffset =
getPictureBytesStartOffset(dataBlockStartOfsset, _dataStream);
+    this.dataBlockSize = LittleEndian.getInt(_dataStream,
dataBlockStartOfsset);
+    this.size = dataBlockSize - (pictureBytesStartOffset -
dataBlockStartOfsset);
+
+    this.aspectRatioX = extractAspectRatioX(_dataStream,
dataBlockStartOfsset);
+    this.aspectRatioY = extractAspectRatioY(_dataStream,
dataBlockStartOfsset);
+//    this.fileName = extractFileName(dataBlockStartOfsset, _dataStream);
+//    if (fileName==null || fileName.length()==0) {
+//	 fileName = "clipboard";
+//    }
+
+    if (fillBytes)
+    {
+      fillImageContent(_dataStream);
+    }
+
+    String ext = suggestFileExtension();
+    // trying to extract width and height from pictures content:
+    if ("jpg".equalsIgnoreCase(ext)) {
+      fillJPGWidthHeight();
+    } else if ("png".equalsIgnoreCase(ext)) {
+      fillPNGWidthHeight();
+    }
+  }
+
+  private static int extractAspectRatioX(byte[] _dataStream, int
dataBlockStartOffset)
+  {
+    return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x20)/10;
+  }
+
+  private static int extractAspectRatioY(byte[] _dataStream, int
dataBlockStartOffset)
+  {
+    return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x22)/10;
+  }
+
+  /**
+   * Tries to suggest a filename: hex representation of picture structure
offset in "Data" stream plus an extension that
+   * is guessed from the first bytes of the picture's content.
+   *
+   * @return suggested file name
+   */
+  public String suggestFullFileName()
+  {
+    String fileExt = suggestFileExtension();
+    return Integer.toHexString(dataBlockStartOfsset) + (fileExt.length()>0 ?
"."+fileExt : "");
+  }
+
+  /**
+   * Writes Picture's content bytes to specified OutputStream.
+   * Is useful when there is need to write picture bytes directly to stream,
omitting its representation in
+   * memory as distinct byte array.
+   *
+   * @param out a stream to write to
+   * @throws IOException if an error occurs while writing to the
specified out
+   */
+  public void writeImageContent(OutputStream out) throws IOException
+  {
+    if (content!=null && content.length>0) {
+      out.write(content, 0, size);
+    } else {
+      out.write(_dataStream, pictureBytesStartOffset, size);
+    }
+  }
+
+  /**
+   * @return picture's content as byte array
+   */
+  public byte[] getContent()
+  {
+    if (content == null || content.length<=0)
+    {
+      fillImageContent(this._dataStream);
+    }
+    return content;
+  }
+
+  /**
+   *
+   * @return size in bytes of the picture
+   */
+  public int getSize()
+  {
+    return size;
+  }
+
+  /**
+   * returns horizontal aspect ratio for picture provided by user
+   */
+  public int getAspectRatioX()
+  {
+    return aspectRatioX;
+  }
+  /**
+   * returns vertical aspect ratio for picture provided by user
+   */
+  public int getAspectRatioY()
+  {
+    return aspectRatioY;
+  }
+
+  /**
+   * tries to suggest extension for picture's file by matching signatures of
popular image formats to first bytes
+   * of picture's contents
+   * @return suggested file extension
+   */
+  public String suggestFileExtension()
+  {
+    if (content!=null && content.length>0) {
+      return suggestFileExtension(content, 0);
+    }
+    return suggestFileExtension(_dataStream, pictureBytesStartOffset);
+  }
+
+
+  private String suggestFileExtension(byte[] _dataStream, int
pictureBytesStartOffset)
+  {
+    if (matchSignature(_dataStream, JPG, pictureBytesStartOffset)) {
+      return "jpg";
+    } else if (matchSignature(_dataStream, PNG, pictureBytesStartOffset)) {
+      return "png";
+    } else if (matchSignature(_dataStream, GIF, pictureBytesStartOffset)) {
+      return "gif";
+    } else if (matchSignature(_dataStream, BMP, pictureBytesStartOffset)) {
+      return "bmp";
+    } else if (matchSignature(_dataStream, TIFF, pictureBytesStartOffset)) {
+      return "tiff";
+    } else if (matchSignature(_dataStream, TIFF1, pictureBytesStartOffset)) {
+      return "tiff";
+    }
+    return "";
+  }
+
+  private static boolean matchSignature(byte[] dataStream, byte[] signature,
int pictureBytesOffset)
+  {
+    boolean matched = true;
+    for (int i = 0; i < dataStream.length && i< signature.length; i++)
+    {
+      if (dataStream[i+pictureBytesOffset] != signature[i])
+      {
+	 matched = false;
+	 break;
+      }
+    }
+    return matched;
+  }
+
+//  public String getFileName()
+//  {
+//    return fileName;
+//  }
+
+//  private static String extractFileName(int blockStartIndex, byte[]
dataStream) {
+//	   int fileNameStartOffset = blockStartIndex + 0x7C;
+//	   int fileNameSizeOffset = blockStartIndex + FILENAME_SIZE_OFFSET;
+//	   int fileNameSize = LittleEndian.getShort(dataStream,
fileNameSizeOffset);
+//
+//	   int fileNameIndex = fileNameStartOffset;
+//	   char[] fileNameChars = new char[(fileNameSize-1)/2];
+//	   int charIndex = 0;
+//	   while(charIndex<fileNameChars.length) {
+//	       short aChar = LittleEndian.getShort(dataStream, fileNameIndex);
+//	       fileNameChars[charIndex] = (char)aChar;
+//	       charIndex++;
+//	       fileNameIndex += 2;
+//	   }
+//	   String fileName = new String(fileNameChars);
+//	   return fileName.trim();
+//    }
+
+  private void fillImageContent(byte[] dataStream)
+  {
+    this.content = new byte[size];
+    System.arraycopy(dataStream, pictureBytesStartOffset, content, 0, size);
+  }
+
+  private static int getPictureBytesStartOffset(int dataBlockStartOffset,
byte[] _dataStream)
+  {
+    int realPicoffset = dataBlockStartOffset;
+
+    int PICTFBlockSize = LittleEndian.getShort(_dataStream,
dataBlockStartOffset +PICT_HEADER_OFFSET);
+    int PICTF1BlockOffset = PICTFBlockSize + PICT_HEADER_OFFSET;
+    int PICTF1BlockSize = LittleEndian.getShort(_dataStream,
dataBlockStartOffset +PICTF1BlockOffset);
+    int unknownHeaderOffset = PICTF1BlockSize + PICTF1BlockOffset;
+    realPicoffset += (unknownHeaderOffset + UNKNOWN_HEADER_SIZE);
+    return realPicoffset;
+  }
+
+  private void fillJPGWidthHeight() {
+    /*
+    http://www.codecomments.com/archive281-2004-3-158083.html
+
+    Algorithm proposed by Patrick TJ McPhee:
+
+    read 2 bytes
+    make sure they are 'ffd8'x
+    repeatedly:
+    read 2 bytes
+    make sure the first one is 'ff'x
+    if the second one is 'd9'x stop
+    else if the second one is c0 or c2 (or possibly other values ...)
+    skip 2 bytes
+    read one byte into depth
+    read two bytes into height
+    read two bytes into width
+    else
+    read two bytes into length
+    skip forward length-2 bytes
+
+    Also used Ruby code snippet from:
http://www.bigbold.com/snippets/posts/show/805 for reference
+    */
+    int pointer = pictureBytesStartOffset+2;
+    int firstByte = _dataStream[pointer];
+    int secondByte = _dataStream[pointer+1];
+
+    int endOfPicture = pictureBytesStartOffset + size;
+    while(pointer<endOfPicture-1) {
+      do {
+	 firstByte = _dataStream[pointer];
+	 secondByte = _dataStream[pointer+1];
+      } while (!(firstByte==(byte)0xFF) && pointer<endOfPicture-1);
+
+      if (firstByte==((byte)0xFF) && pointer<endOfPicture-1) {
+	 if (secondByte==(byte)0xD9 || secondByte==(byte)0xDA) {
+	   break;
+	 } else if ( (secondByte & 0xF0) == 0xC0 && secondByte!=(byte)0xC4 &&
secondByte!=(byte)0xC8 && secondByte!=(byte)0xCC) {
+	   pointer += 5;
+	   this.height = getBigEndianShort(_dataStream, pointer);
+	   this.width = getBigEndianShort(_dataStream, pointer+2);
+	   break;
+	 } else {
+	   pointer++;
+	   pointer++;
+	   int length = getBigEndianShort(_dataStream, pointer);
+	   pointer+=length;
+	 }
+      } else {
+	 pointer++;
+      }
+    }
+  }
+
+  private void fillPNGWidthHeight()
+  {
+    /*
+     Used PNG file format description from
http://www.wotsit.org/download.asp?f=png
+    */
+    int HEADER_START = pictureBytesStartOffset + PNG.length + 4;
+    if (matchSignature(_dataStream, IHDR, HEADER_START)) {
+      int IHDR_CHUNK_WIDTH = HEADER_START + 4;
+      this.width = getBigEndianInt(_dataStream, IHDR_CHUNK_WIDTH);
+      this.height = getBigEndianInt(_dataStream, IHDR_CHUNK_WIDTH + 4);
+    }
+  }
+  /**
+   * returns pixel width of the picture or -1 if the dimensions could not be
determined
+   */
+  public int getWidth()
+  {
+    return width;
+  }
+  /**
+   * returns pixel height of the picture or -1 if the dimensions could not be
determined
+   */
+  public int getHeight()
+  {
+    return height;
+  }
+
+  private static int getBigEndianInt(byte[] data, int offset)
+  {
+    return (((data[offset] & 0xFF)<< 24) + ((data[offset +1] & 0xFF) <<
16) +
((data[offset + 2] & 0xFF) << 8) + (data[offset +3] & 0xFF));
+  }
+
+  private static int getBigEndianShort(byte[] data, int offset)
+  {
+    return (((data[offset] & 0xFF)<< 8) + (data[offset +1] & 0xFF));
+  }
+
+}
Index:
D:/java/svn-apache/src/scratchpad/testcases/org/apache/poi/hwpf/data/testPictur
es.doc
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on:
D:\java\svn-apache\src\scratchpad\testcases\org\apache\poi\hwpf\data\testPictur
es.doc
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index:
D:/java/svn-apache/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPictu
resTable.java
===================================================================
---
D:/java/svn-apache/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPictu
resTable.java	     (revision 0)
+++
D:/java/svn-apache/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPictu
resTable.java	     (revision 0)
@@ -0,0 +1,71 @@
+/* ====================================================================
+   Copyright 2002-2006   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.Picture;
+import junit.framework.TestCase;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.util.List;
+
+/**
+ * @author Dmitry Romanov
+ * @version $Id: $
+ */
+public class TestPicturesTable
+  extends TestCase
+{
+  private HWPFDocument document;
+  private String testPath;
+
+
+  public TestPicturesTable(String string)
+  {
+    super(string);
+  }
+
+  protected void setUp() throws Exception
+  {
+    testPath = System.getProperty("HWPF.testdata.path");
+    if (testPath == null)
+    {
+      testPath = "c:";
+    }
+    String testFile = testPath + "/testPictures.doc";
+    document = new HWPFDocument(new FileInputStream(testFile));
+  }
+
+  public void testGetAllPictures() throws Exception {
+    PicturesTable picturesTable = document.getPicturesTable();
+    List allPictures = picturesTable.getAllPictures();
+    assertNotNull(allPictures);
+    assertTrue(allPictures.size() >= 5 );
+    for (int i = 0; i < allPictures.size(); i++)
+    {
+      Picture picture = (Picture) allPictures.get(i);
+      System.out.println(picture.suggestFullFileName()+":
"+picture.getSize()+" bytes"+" width:
"+picture.getWidth()+"("+picture.getAspectRatioX()+
+	       "%) height:
"+picture.getHeight()+"("+picture.getAspectRatioY()+"%)");
+
+      FileOutputStream out = new
FileOutputStream(testPath+"/"+picture.suggestFullFileName());
+      out.write(picture.getContent());
+      out.close();
+    }
+  }
+
+}


-- 
Configure bugmail: http://issues.apache.org/bugzilla/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are the assignee for the bug, or are watching the assignee.

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-dev-unsubscribe@jakarta.apache.org
Mailing List:    http://jakarta.apache.org/site/mail2.html#poi
The Apache Jakarta POI Project: http://jakarta.apache.org/poi/


Mime
View raw message