/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/
17 |
|
---|
18 | package crosby.binary;
|
---|
19 |
|
---|
20 | import java.util.Arrays;
|
---|
21 | import java.util.Comparator;
|
---|
22 | import java.util.HashMap;
|
---|
23 |
|
---|
24 | import com.google.protobuf.ByteString;
|
---|
25 |
|
---|
26 | /**
|
---|
27 | * Class for mapping a set of strings to integers, giving frequently occuring
|
---|
28 | * strings small integers.
|
---|
29 | */
|
---|
30 | public class StringTable {
|
---|
31 | public StringTable() {
|
---|
32 | clear();
|
---|
33 | }
|
---|
34 |
|
---|
35 | private HashMap<String, Integer> counts;
|
---|
36 | private HashMap<String, Integer> stringmap;
|
---|
37 | private String set[];
|
---|
38 |
|
---|
39 | public void incr(String s) {
|
---|
40 | if (counts.containsKey(s)) {
|
---|
41 | counts.put(s, new Integer(counts.get(s).intValue() + 1));
|
---|
42 | } else {
|
---|
43 | counts.put(s, new Integer(1));
|
---|
44 | }
|
---|
45 | }
|
---|
46 |
|
---|
47 | /** After the stringtable has been built, return the offset of a string in it.
|
---|
48 | *
|
---|
49 | * Note, value '0' is reserved for use as a delimiter and will not be returned.
|
---|
50 | * @param s
|
---|
51 | * @return
|
---|
52 | */
|
---|
53 | public int getIndex(String s) {
|
---|
54 | return stringmap.get(s).intValue();
|
---|
55 | }
|
---|
56 |
|
---|
57 | public void finish() {
|
---|
58 | Comparator<String> comparator = new Comparator<String>() {
|
---|
59 | //@Override
|
---|
60 | public int compare(final String s1, String s2) {
|
---|
61 | int diff = counts.get(s2) - counts.get(s1);
|
---|
62 | return diff;
|
---|
63 | }
|
---|
64 | };
|
---|
65 |
|
---|
66 | set = counts.keySet().toArray(new String[0]);
|
---|
67 | if (set.length > 0) {
|
---|
68 | // Sort based on the frequency.
|
---|
69 | Arrays.sort(set, comparator);
|
---|
70 | // Each group of keys that serializes to the same number of bytes is
|
---|
71 | // sorted lexiconographically.
|
---|
72 | // to maximize deflate compression.
|
---|
73 |
|
---|
74 | // Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
|
---|
75 | //Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));
|
---|
76 |
|
---|
77 | Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
|
---|
78 | set.length-1));
|
---|
79 | Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
|
---|
80 | set.length-1), comparator);
|
---|
81 | }
|
---|
82 | stringmap = new HashMap<String, Integer>(2 * set.length);
|
---|
83 | for (int i = 0; i < set.length; i++) {
|
---|
84 | stringmap.put(set[i], new Integer(i+1)); // Index 0 is reserved for use as a delimiter.
|
---|
85 | }
|
---|
86 | counts = null;
|
---|
87 | }
|
---|
88 |
|
---|
89 | public void clear() {
|
---|
90 | counts = new HashMap<String, Integer>(100);
|
---|
91 | stringmap = null;
|
---|
92 | set = null;
|
---|
93 | }
|
---|
94 |
|
---|
95 | public Osmformat.StringTable.Builder serialize() {
|
---|
96 | Osmformat.StringTable.Builder builder = Osmformat.StringTable
|
---|
97 | .newBuilder();
|
---|
98 | builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
|
---|
99 | for (int i = 0; i < set.length; i++)
|
---|
100 | builder.addS(ByteString.copyFromUtf8(set[i]));
|
---|
101 | return builder;
|
---|
102 | }
|
---|
103 | }
|
---|