poi-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Kevin Roast" <kevin.ro...@alfresco.org>
Subject FW: Problem extracting properties from Unicode/UTF-8 codepage files + patch to fix
Date Thu, 15 Jun 2006 09:58:50 GMT
---
W:\poi-src-2.5.1\original_src\java\org\apache\poi\hpsf\TypeReader.java
Sun Feb 22 10:54:46 2004
+++ TypeReader.java	Wed Jun 14 18:51:26 2006
@@ -45,8 +45,7 @@
      *
      * @see Variant
      */
-    public static Object read(final byte[] src, int offset, int length,
-                              final int type)
+    public static Object read(final byte[] src, int offset, int length,
final int type, int codepage)
     {
         /*
          * FIXME: Support reading more types and clean up this code!
@@ -101,18 +100,55 @@
                  * Read a byte string. In Java it is represented as a
                  * String object. The 0x00 bytes at the end must be
                  * stripped.
-                 *
-                 * FIXME: Reading an 8-bit string should pay attention
-                 * to the codepage. Currently the byte making out the
-                 * property's value are interpreted according to the
-                 * platform's default character set.
+                 * 
+                 * Reading an 8-bit string should pay attention
+                 * to the codepage.
                  */
                 final int first = offset + LittleEndian.INT_SIZE;
                 long last = first + LittleEndian.getUInt(src, offset) -
1;
+                long l = last - first;
                 offset += LittleEndian.INT_SIZE;
-                while (src[(int) last] == 0 && first <= last)
-                    last--;
-                value = new String(src, (int) first, (int) (last -
first + 1));
+                switch (codepage)
+                {
+                    case Property.CP_UNICODE:
+                    {
+                        StringBuffer b = new StringBuffer((int) (last -
first));
+                        for (int i = 0; i <= l; i++)
+                        {
+                            final int i1 = offset + (i * 2);
+                            final int i2 = i1 + 1;
+                            b.append((char) ((src[i2] << 8) +
src[i1]));
+                        }
+                        /* Strip 0x00 characters from the end of the
string: */
+                        while (b.charAt(b.length() - 1) == 0x00)
+                            b.setLength(b.length() - 1);
+                        value = b.toString();
+                    }
+                    break;
+                    
+                    case Property.CP_UTF8:
+                    {
+                        try
+                        {
+                            value = new String(src, (int)first, (int)l,
"UTF-8");
+                        }
+                        catch (java.io.UnsupportedEncodingException
err)
+                        {
+                            while (src[(int) last] == 0 && first <=
last)
+                            last--;
+                            value = new String(src, (int)first,
(int)(last - first + 1));
+                        }
+                    }    
+                    break;
+                    
+                    default:
+                    {
+                        while (src[(int) last] == 0 && first <= last)
+                            last--;
+                        value = new String(src, (int)first, (int)(last
- first + 1));
+                    }
+                    break;
+                }
                 break;
             }
             case Variant.VT_LPWSTR:




--- W:\poi-src-2.5.1\original_src\java\org\apache\poi\hpsf\Property.java
Sun Feb 22 10:54:46 2004
+++ Property.java	Thu Jun 15 09:11:43 2006
@@ -48,9 +48,11 @@
  */
 public class Property
 {
-
-    /* Codepage 1200 denotes Unicode. */
-    private static int CP_UNICODE = 1200;
+    /* Codepage 1200 denotes little-endian Unicode. */
+    public static final int CP_UNICODE = 1200;
+    
+    /* Codepage 65001 denotes UTF-8 Unicode. */
+    public static final int CP_UTF8 = 65001;
 
     private int id;
 
@@ -65,11 +67,8 @@
         return id;
     }
 
-
-
     private long type;
 
-
     /**
      * <p>Returns the property's type.</p>
      *
@@ -80,11 +79,8 @@
         return type;
     }
 
-
-
     private Object value;
 
-
     /**
      * <p>Returns the property's value.</p>
      *
@@ -95,8 +91,6 @@
         return value;
     }
 
-
-
     /**
      * <p>Creates a {@link Property} instance by reading its bytes
      * from the property set stream.</p>
@@ -130,7 +124,7 @@
 
 	try
 	{
-	    value = TypeReader.read(src, o, length, (int) type);
+	    value = TypeReader.read(src, o, length, (int) type,
codepage);
 	}
 	catch (Throwable t)
 	{ 



________________________________

From: Kevin Roast [mailto:kevin.roast@alfresco.org] 
Sent: 15 June 2006 10:53
To: poi-dev@jakarta.apache.org
Subject: Problem extracting properties from Unicode/UTF-8 codepage files
+ patch to fix



Hello, 

We are successfully using the POI library for meta-data extraction in
the Alfresco open-source ECM project. 

A problem has been reported to us that properties such as Title,
Description and Author fail to get read correctly if the codepage used
in the office doc is Unicode or UTF-8. I have patched the code to
support these code pages during property reads. We are then able to read
properties in languages such as Japanese, Arabic, Cyrillic and Greek etc.

Find attached the .patch files against POI version 2.5.1. 

Thanks, 

Kevin 
-- 
http://www.alfresco.org <http://www.alfresco.org>  

<<TypeReader.patch>> <<Property.patch>> 


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-dev-unsubscribe@jakarta.apache.org
Mailing List:    http://jakarta.apache.org/site/mail2.html#poi
The Apache Jakarta POI Project: http://jakarta.apache.org/poi/


Mime
View raw message