flink-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From StephanEwen <...@git.apache.org>
Subject [GitHub] incubator-flink pull request: Serialized String comparison, Unicod...
Date Mon, 01 Sep 2014 15:40:38 GMT
Github user StephanEwen commented on the pull request:

    https://github.com/apache/incubator-flink/pull/4#issuecomment-54073097
  
    Here is a micro benchmark of String Serialization I did recently. It is extremely simple
and performs well. Do you have any feeling how it compares to 
    
    The basic trick is not to use the varlength encodings, but represent the binary variant
the same way as object variant.
    
    ```java
    public class StringSortingBenchmark {
    
    	private static final long SEED = 0xa2e2223fc6643bbl;
    	
    	private static final int MIN_LEN = 4;
    	private static final int MAX_LEN = 213;
    	
    	private static final sun.misc.Unsafe UNSAFE = MemoryUtils.UNSAFE;
    	private static final long BASE_OFFSET = UNSAFE.arrayBaseOffset(byte[].class);
    	
    	public static long datasink = 0; // side effect variable to prevent optimizing loops
away
    	
    	public static void main(String[] args) {
    		// we run comparisons between elements in arrays in the pattern quickstart does is:
pointers coming from both ends
    		
    		final int NUM = 2000000; // 2 million
    
    		final String[] elements = new String[NUM];
    		
    		final byte[] bytes = new byte[Integer.MAX_VALUE / 2];
    		final int[] pointers = new int[NUM];
    		
    		System.out.println("Creating the strings");
    		{
    			Random rnd = new Random(SEED);
    			
    			int ptr = 0;
    			
    			for (int i = 0; i < NUM; i++) {
    				String str = getRandomString(rnd);
    				elements[i] = str;
    				
    				pointers[i] = ptr;
    				ptr += serializeString(bytes, ptr, str);
    			}
    		}
    		
    		System.out.println("Verifying serialization and comparison methods");
    		{
    			int a = 0;
    			int b = NUM - 1;
    			
    			while (a < NUM) {				
    				String deser = deserializeString(bytes, pointers[a]);
    				if (!elements[a].equals(deser)) {
    					throw new RuntimeException("Wrong serialization: " + elements[a] + " versus " + deser);
    				}
    				
    				int cmp1 = elements[a].compareTo(elements[b]);
    				int cmp2 = compareDeserializing(bytes, pointers[a], pointers[b]);
    				int cmp3 = compareBinaryStrings(bytes, pointers[a], pointers[b]);
    				
    				if (Math.signum(cmp1) != Math.signum(cmp2)) {
    					throw new RuntimeException("Wrong comparison result between deserialized and original");
    				}
    				
    				if (Math.signum(cmp1) != Math.signum(cmp3)) {
    					throw new RuntimeException("Wrong comparison result between binary and original");
    				}
    
    				a++;
    				b--;
    			}
    		}
    		
    		long objectsTime;
    		long deserializingTime;
    		long binaryTime;
    		
    		System.out.println("Running experiments with string objects.");
    		{
    			int a = 0;
    			int b = NUM - 1;
    			
    			long acc = 0;  // use this so no compiler optimizes the loop away
    			long start = System.nanoTime();
    		
    			while (a < NUM) {
    				long cmp = elements[a++].compareTo(elements[b--]);
    				acc += cmp;
    			}
    			
    			objectsTime = System.nanoTime() - start;
    			datasink += acc;
    		}
    		
    		System.out.println("Running experiments deserializing string objects.");
    		{
    			int a = 0;
    			int b = NUM - 1;
    			
    			long acc = 0;  // use this so no compiler optimizes the loop away
    			long start = System.nanoTime();
    		
    			while (a < NUM) {
    				long cmp = compareDeserializing(bytes, pointers[a++], pointers[b--]);
    				acc += cmp;
    			}
    			
    			deserializingTime = System.nanoTime() - start;
    			datasink += acc;
    		}
    		
    		System.out.println("Running experiments with binary comparisons.");
    		{
    			int a = 0;
    			int b = NUM - 1;
    			
    			long acc = 0;  // use this so no compiler optimizes the loop away
    			long start = System.nanoTime();
    		
    			while (a < NUM) {
    				long cmp = compareBinaryStrings(bytes, pointers[a++], pointers[b--]);
    				acc += cmp;
    			}
    			
    			binaryTime = System.nanoTime() - start;	
    			datasink += acc;
    		}
    		
    		System.out.println(String.format("Time taken\n   - String objects: %d µsecs\n   - String
deserialization: %d µsecs\n   - Binary comparison: %d µsecs", objectsTime / 1000, deserializingTime
/ 1000, binaryTime / 1000));
    	}
    	
    	private static String getRandomString(Random rnd) {
    		final int len = rnd.nextInt(MAX_LEN - MIN_LEN + 1) + MIN_LEN;
    		final StringBuilder bld = new StringBuilder(len);
    		
    		for (int i = 0; i < len; i++) {
    			bld.append((char) (rnd.nextInt(100) + 33));
    		}	
    		return bld.toString();
    	}
    	
    	private static final int serializeString(byte[] target, int offset, String str) {
    		final int len = str.length();
    		long pos = offset + BASE_OFFSET;
    		
    		UNSAFE.putInt(target, pos, len);
    		pos += 4;
    		
    		for (int i = 0; i < len; i++, pos += 2) {
    			char towrite = Character.reverseBytes(str.charAt(i));
    			UNSAFE.putChar(target, pos, towrite);
    		}
    		
    		return 4 + 2*len;
    	}
    	
    	private static final String deserializeString(byte[] data, int offset) {
    		long pos = offset + BASE_OFFSET;
    		final int len = UNSAFE.getInt(data, pos);
    		pos += 4;
    		
    		StringBuilder bld = new StringBuilder(len);
    		
    		for (int i = 0; i < len; i++, pos += 2) {
    			char read = UNSAFE.getChar(data, pos);
    			read = Character.reverseBytes(read);
    			bld.append(read);
    		}
    		
    		return bld.toString();
    	}
    	
    	private static final int compareBinaryStrings(byte[] data, int off1, int off2) {	
    		int len1 = UNSAFE.getInt(data, off1 + BASE_OFFSET);
    		int len2 = UNSAFE.getInt(data, off2 + BASE_OFFSET);
    		off1 += 4;
    		off2 += 4;
    		
    		final int binCompLen = 2 * Math.min(len1, len2);
    		int val = 0;
    		
    		// binary comparison
    		for (int pos = 0; pos < binCompLen && (val = (data[off1 + pos] & 0xff)
- (data[off2 + pos] & 0xff)) == 0; pos++);
    		return val != 0 ? val : len1 - len2;
    	}
    	
    	private static final int compareDeserializing(byte[] data, int off1, int off2) {
    		String str1 = deserializeString(data, off1);
    		String str2 = deserializeString(data, off2);
    		return str1.compareTo(str2);
    	}
    }
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

Mime
View raw message