Class CollationBuilder

    • Method Summary

      All Methods Static Methods Instance Methods Concrete Methods 
      Modifier and Type Method Description
      private int addIfDifferent​(java.lang.CharSequence prefix, java.lang.CharSequence str, long[] newCEs, int newCEsLength, int ce32)  
      private int addOnlyClosure​(java.lang.CharSequence nfdPrefix, java.lang.CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32)  
      (package private) void addRelation​(int strength, java.lang.CharSequence prefix, java.lang.CharSequence str, java.lang.CharSequence extension)
      Implements CollationRuleParser.Sink.
      (package private) void addReset​(int strength, java.lang.CharSequence str)
      Implements CollationRuleParser.Sink.
      private void addTailComposites​(java.lang.CharSequence nfdPrefix, java.lang.CharSequence nfdString)  
      private int addWithClosure​(java.lang.CharSequence nfdPrefix, java.lang.CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32)
      Adds the mapping and its canonical closure.
      private static int alignWeightRight​(int w)  
      private static int binarySearchForRootPrimaryNode​(int[] rootPrimaryIndexes, int length, long[] nodes, long p)
      Like Java Collections.binarySearch(List, key, Comparator).
      private static int ceStrength​(long ce)  
      private static long changeNodeNextIndex​(long node, int next)  
      private static long changeNodePreviousIndex​(long node, int previous)  
      private void closeOverComposites()  
      private static int countTailoredNodes​(long[] nodesArray, int i, int strength)
      Counts the tailored nodes of the given strength up to the next node which is either stronger or has an explicit weight of this strength.
      private boolean equalSubSequences​(java.lang.CharSequence left, int leftStart, java.lang.CharSequence right, int rightStart)  
      private void finalizeCEs()
      Replaces temporary CEs with the final CEs they point to.
      private int findCommonNode​(int index, int strength)
      Finds the node which implies or contains a common=05 weight of the given strength (secondary or tertiary), if the current node is stronger.
      private int findOrInsertNodeForCEs​(int strength)
      Picks one of the current CEs and finds or inserts a node in the graph for the CE + strength.
      private int findOrInsertNodeForPrimary​(long p)
      Finds or inserts the node for a root CE's primary weight.
      private int findOrInsertNodeForRootCE​(long ce, int strength)  
      private int findOrInsertWeakNode​(int index, int weight16, int level)
      Finds or inserts the node for a secondary or tertiary weight.
      private long getSpecialResetPosition​(java.lang.CharSequence str)  
      private int getWeight16Before​(int index, long node, int level)
      Returns the secondary or tertiary weight preceding the current node's weight.
      private boolean ignorePrefix​(java.lang.CharSequence s)  
      private boolean ignoreString​(java.lang.CharSequence s)  
      private static int indexFromTempCE​(long tempCE)  
      private static int indexFromTempCE32​(int tempCE32)  
      private int insertNodeBetween​(int index, int nextIndex, long node)
      Inserts a new node into the list, between list-adjacent items.
      private int insertTailoredNodeAfter​(int index, int strength)
      Makes and inserts a new tailored node into the list, after the one at index.
      private boolean isFCD​(java.lang.CharSequence s)  
      private static boolean isTailoredNode​(long node)  
      private static boolean isTempCE​(long ce)  
      private static boolean isTempCE32​(int ce32)  
      private void makeTailoredCEs()
      Walks the tailoring graph and overwrites tailored nodes with new CEs.
      private boolean mergeCompositeIntoString​(java.lang.CharSequence nfdString, int indexAfterLastStarter, int composite, java.lang.CharSequence decomp, java.lang.StringBuilder newNFDString, java.lang.StringBuilder newString)  
      private static int nextIndexFromNode​(long node)  
      private static long nodeFromNextIndex​(int next)  
      private static long nodeFromPreviousIndex​(int previous)  
      private static long nodeFromStrength​(int strength)  
      private static long nodeFromWeight16​(int weight16)  
      private static long nodeFromWeight32​(long weight32)  
      private static boolean nodeHasAnyBefore​(long node)  
      private static boolean nodeHasBefore2​(long node)  
      private static boolean nodeHasBefore3​(long node)  
      (package private) void optimize​(UnicodeSet set)
      Implements CollationRuleParser.Sink.
      CollationTailoring parseAndBuild​(java.lang.String ruleString)  
      private static int previousIndexFromNode​(long node)  
      private static boolean sameCEs​(long[] ces1, int ces1Length, long[] ces2, int ces2Length)  
      private void setCaseBits​(java.lang.CharSequence nfdString)  
      private static int strengthFromNode​(long node)  
      private static int strengthFromTempCE​(long tempCE)  
      (package private) void suppressContractions​(UnicodeSet set)
      Implements CollationRuleParser.Sink.
      private static long tempCEFromIndexAndStrength​(int index, int strength)
      Encodes "temporary CE" data into a CE that fits into the CE32 data structure, with 2-byte primary, 1-byte secondary and 6-bit tertiary, with valid CE byte values.
      private static int weight16FromNode​(long node)  
      private static long weight32FromNode​(long node)  
      • Methods inherited from class java.lang.Object

        clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
    • Field Detail

      • kClosureLoopLimit

        private static int kClosureLoopLimit
      • COMPOSITES

        private static final UnicodeSet COMPOSITES
      • MAX_INDEX

        private static final int MAX_INDEX
        At most 1M nodes, limited by the 20 bits in node bit fields.
        See Also:
        Constant Field Values
      • HAS_BEFORE2

        private static final int HAS_BEFORE2
        Node bit 6 is set on a primary node if there are nodes with secondary values below the common secondary weight (05).
        See Also:
        Constant Field Values
      • HAS_BEFORE3

        private static final int HAS_BEFORE3
        Node bit 5 is set on a primary or secondary node if there are nodes with tertiary values below the common tertiary weight (05).
        See Also:
        Constant Field Values
      • IS_TAILORED

        private static final int IS_TAILORED
        Node bit 3 distinguishes a tailored node, which has no weight value, from a node with an explicit (root or default) weight.
        See Also:
        Constant Field Values
      • variableTop

        private long variableTop
      • fastLatinEnabled

        private boolean fastLatinEnabled
      • ces

        private long[] ces
      • cesLength

        private int cesLength
      • rootPrimaryIndexes

        private UVector32 rootPrimaryIndexes
        Indexes of nodes with root primary weights, sorted by primary. Compact form of a TreeMap from root primary to node index. This is a performance optimization for finding reset positions. Without this, we would have to search through the entire nodes list. It also allows storing root primary weights in list head nodes, without previous index, leaving room in root primary nodes for 32-bit primary weights.
      • nodes

        private UVector64 nodes
        Data structure for assigning tailored weights and CEs. Doubly-linked lists of nodes in mostly collation order. Each list starts with a root primary node and ends with a nextIndex of 0. When there are any nodes in the list, then there is always a root primary node at index 0. This allows some code not to have to check explicitly for nextIndex==0. Root primary nodes have 32-bit weights but do not have previous indexes. All other nodes have at most 16-bit weights and do have previous indexes. Nodes with explicit weights store root collator weights, or default weak weights (e.g., secondary 05) for stronger nodes. "Tailored" nodes, with the IS_TAILORED bit set, do not store explicit weights but rather create a difference of a certain strength from the preceding node. A root node is followed by either - a root/default node of the same strength, or - a root/default node of the next-weaker strength, or - a tailored node of the same strength. A node of a given strength normally implies "common" weights on weaker levels. A node with HAS_BEFORE2 must be immediately followed by a secondary node with an explicit below-common weight, then a secondary tailored node, and later an explicit common-secondary node. The below-common weight can be a root weight, or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight or before the lowest root weight. (&[before 2] resets to an explicit secondary node so that the following addRelation(secondary) tailors right after that. If we did not have this node and instead were to reset on the primary node, then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) If the flag is not set, then there are no explicit secondary nodes with the common or lower weights. Same for HAS_BEFORE3 for tertiary nodes and weights. A node must not have both flags set. Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs which point to stable indexes in this list, and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, until the next relation is added. At the end, the tailored weights are allocated as necessary, then the tailored nodes are replaced with final CEs, and the CollationData is rewritten by replacing temporary CEs with final ones. We cannot simply insert new nodes in the middle of the array because that would invalidate the indexes stored in existing temporary CEs. We need to use a linked graph with stable indexes to existing nodes. A doubly-linked list seems easiest to maintain. Each node is stored as an long, with its fields stored as bit fields. Root primary node: - primary weight: 32 bits 63..32 - reserved/unused/zero: 4 bits 31..28 Weaker root nodes & tailored nodes: - a weight: 16 bits 63..48 + a root or default weight for a non-tailored node + unused/zero for a tailored node - index to the previous node: 20 bits 47..28 All types of nodes: - index to the next node: 20 bits 27..8 + nextIndex=0 in last node per root-primary list - reserved/unused/zero bits: bits 7, 4, 2 - HAS_BEFORE2: bit 6 - HAS_BEFORE3: bit 5 - IS_TAILORED: bit 3 - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 We could allocate structs with pointers, but we would have to store them in a pointer list so that they can be indexed from temporary CEs, and they would require more memory allocations.
    • Method Detail

      • parseAndBuild

        public CollationTailoring parseAndBuild​(java.lang.String ruleString)
                                         throws java.text.ParseException
        Throws:
        java.text.ParseException
      • addReset

        void addReset​(int strength,
                      java.lang.CharSequence str)
        Implements CollationRuleParser.Sink.
        Specified by:
        addReset in class CollationRuleParser.Sink
      • getWeight16Before

        private int getWeight16Before​(int index,
                                      long node,
                                      int level)
        Returns the secondary or tertiary weight preceding the current node's weight. node=nodes[index].
      • getSpecialResetPosition

        private long getSpecialResetPosition​(java.lang.CharSequence str)
      • addRelation

        void addRelation​(int strength,
                         java.lang.CharSequence prefix,
                         java.lang.CharSequence str,
                         java.lang.CharSequence extension)
        Implements CollationRuleParser.Sink.
        Specified by:
        addRelation in class CollationRuleParser.Sink
      • findOrInsertNodeForCEs

        private int findOrInsertNodeForCEs​(int strength)
        Picks one of the current CEs and finds or inserts a node in the graph for the CE + strength.
      • findOrInsertNodeForRootCE

        private int findOrInsertNodeForRootCE​(long ce,
                                              int strength)
      • binarySearchForRootPrimaryNode

        private static final int binarySearchForRootPrimaryNode​(int[] rootPrimaryIndexes,
                                                                int length,
                                                                long[] nodes,
                                                                long p)
        Like Java Collections.binarySearch(List, key, Comparator).
        Returns:
        the index>=0 where the item was found, or the index<0 for inserting the string at ~index in sorted order (index into rootPrimaryIndexes)
      • findOrInsertNodeForPrimary

        private int findOrInsertNodeForPrimary​(long p)
        Finds or inserts the node for a root CE's primary weight.
      • findOrInsertWeakNode

        private int findOrInsertWeakNode​(int index,
                                         int weight16,
                                         int level)
        Finds or inserts the node for a secondary or tertiary weight.
      • insertTailoredNodeAfter

        private int insertTailoredNodeAfter​(int index,
                                            int strength)
        Makes and inserts a new tailored node into the list, after the one at index. Skips over nodes of weaker strength to maintain collation order ("postpone insertion").
        Returns:
        the new node's index
      • insertNodeBetween

        private int insertNodeBetween​(int index,
                                      int nextIndex,
                                      long node)
        Inserts a new node into the list, between list-adjacent items. The node's previous and next indexes must not be set yet.
        Returns:
        the new node's index
      • findCommonNode

        private int findCommonNode​(int index,
                                   int strength)
        Finds the node which implies or contains a common=05 weight of the given strength (secondary or tertiary), if the current node is stronger. Skips weaker nodes and tailored nodes if the current node is stronger and is followed by an explicit-common-weight node. Always returns the input index if that node is no stronger than the given strength.
      • setCaseBits

        private void setCaseBits​(java.lang.CharSequence nfdString)
      • addWithClosure

        private int addWithClosure​(java.lang.CharSequence nfdPrefix,
                                   java.lang.CharSequence nfdString,
                                   long[] newCEs,
                                   int newCEsLength,
                                   int ce32)
        Adds the mapping and its canonical closure. Takes ce32=dataBuilder.encodeCEs(...) so that the data builder need not re-encode the CEs multiple times.
      • addOnlyClosure

        private int addOnlyClosure​(java.lang.CharSequence nfdPrefix,
                                   java.lang.CharSequence nfdString,
                                   long[] newCEs,
                                   int newCEsLength,
                                   int ce32)
      • addTailComposites

        private void addTailComposites​(java.lang.CharSequence nfdPrefix,
                                       java.lang.CharSequence nfdString)
      • mergeCompositeIntoString

        private boolean mergeCompositeIntoString​(java.lang.CharSequence nfdString,
                                                 int indexAfterLastStarter,
                                                 int composite,
                                                 java.lang.CharSequence decomp,
                                                 java.lang.StringBuilder newNFDString,
                                                 java.lang.StringBuilder newString)
      • equalSubSequences

        private boolean equalSubSequences​(java.lang.CharSequence left,
                                          int leftStart,
                                          java.lang.CharSequence right,
                                          int rightStart)
      • ignorePrefix

        private boolean ignorePrefix​(java.lang.CharSequence s)
      • ignoreString

        private boolean ignoreString​(java.lang.CharSequence s)
      • isFCD

        private boolean isFCD​(java.lang.CharSequence s)
      • closeOverComposites

        private void closeOverComposites()
      • addIfDifferent

        private int addIfDifferent​(java.lang.CharSequence prefix,
                                   java.lang.CharSequence str,
                                   long[] newCEs,
                                   int newCEsLength,
                                   int ce32)
      • sameCEs

        private static boolean sameCEs​(long[] ces1,
                                       int ces1Length,
                                       long[] ces2,
                                       int ces2Length)
      • alignWeightRight

        private static final int alignWeightRight​(int w)
      • makeTailoredCEs

        private void makeTailoredCEs()
        Walks the tailoring graph and overwrites tailored nodes with new CEs. After this, the graph is destroyed. The nodes array can then be used only as a source of tailored CEs.
      • countTailoredNodes

        private static int countTailoredNodes​(long[] nodesArray,
                                              int i,
                                              int strength)
        Counts the tailored nodes of the given strength up to the next node which is either stronger or has an explicit weight of this strength.
      • finalizeCEs

        private void finalizeCEs()
        Replaces temporary CEs with the final CEs they point to.
      • tempCEFromIndexAndStrength

        private static long tempCEFromIndexAndStrength​(int index,
                                                       int strength)
        Encodes "temporary CE" data into a CE that fits into the CE32 data structure, with 2-byte primary, 1-byte secondary and 6-bit tertiary, with valid CE byte values. The index must not exceed 20 bits (0xfffff). The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). Temporary CEs are distinguished from real CEs by their use of secondary weights 06..45 which are otherwise reserved for compressed sort keys. The case bits are unused and available.
      • indexFromTempCE

        private static int indexFromTempCE​(long tempCE)
      • strengthFromTempCE

        private static int strengthFromTempCE​(long tempCE)
      • isTempCE

        private static boolean isTempCE​(long ce)
      • indexFromTempCE32

        private static int indexFromTempCE32​(int tempCE32)
      • isTempCE32

        private static boolean isTempCE32​(int ce32)
      • ceStrength

        private static int ceStrength​(long ce)
      • nodeFromWeight32

        private static long nodeFromWeight32​(long weight32)
      • nodeFromWeight16

        private static long nodeFromWeight16​(int weight16)
      • nodeFromPreviousIndex

        private static long nodeFromPreviousIndex​(int previous)
      • nodeFromNextIndex

        private static long nodeFromNextIndex​(int next)
      • nodeFromStrength

        private static long nodeFromStrength​(int strength)
      • weight32FromNode

        private static long weight32FromNode​(long node)
      • weight16FromNode

        private static int weight16FromNode​(long node)
      • previousIndexFromNode

        private static int previousIndexFromNode​(long node)
      • nextIndexFromNode

        private static int nextIndexFromNode​(long node)
      • strengthFromNode

        private static int strengthFromNode​(long node)
      • nodeHasBefore2

        private static boolean nodeHasBefore2​(long node)
      • nodeHasBefore3

        private static boolean nodeHasBefore3​(long node)
      • nodeHasAnyBefore

        private static boolean nodeHasAnyBefore​(long node)
      • isTailoredNode

        private static boolean isTailoredNode​(long node)
      • changeNodePreviousIndex

        private static long changeNodePreviousIndex​(long node,
                                                    int previous)
      • changeNodeNextIndex

        private static long changeNodeNextIndex​(long node,
                                                int next)