Skip to content

Improve indexed lookup speed of strings when the last keys characters are part of collated contractions #7093

@asfernandes

Description

@asfernandes

This relates to #6915 but without the need to set DISABLE-COMPRESSIONS=1.

Test cases:

create collation UNICODE_CSCZ_CI
   for UTF8  
   from UNICODE  
   case insensitive  
   'LOCALE=cs_CZ'
;
   
create collation UNICODE_CSCZ_CS
   for UTF8  
   from UNICODE  
   case sensitive  
   'LOCALE=cs_CZ'
;

CREATE TABLE TEST1M (
  ANSI_CZ VARCHAR(10)  CHARACTER SET WIN1250 COLLATE PXW_CSY,
  UNICODE_CS_CZ VARCHAR(10)  CHARACTER SET UTF8 COLLATE UNICODE_CSCZ_CS,
  UNICODE_CI_CZ VARCHAR(10) CHARACTER SET UTF8 COLLATE UNICODE_CSCZ_CI,
  UNICODE_CS VARCHAR(10) CHARACTER SET UTF8,
  UNICODE_CI VARCHAR(10) CHARACTER SET UTF8 COLLATE UNICODE_CI
);

CREATE OR ALTER PROCEDURE GetStr(AORDERID BIGINT)
RETURNS (AResult CHAR(10)) AS
declare variable Base36Chars CHAR(36);
declare variable mResult VARCHAR(10);
declare variable ID BIGINT;
declare variable I INT;
BEGIN
    Base36Chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ';
    mResult = '';
    AResult = mResult;
    ID = AORDERID;
    WHILE (ID > 0) DO
    BEGIN
      I = MOD(ID, 36);
      ID = ID / 36;
      mResult = mResult || SubString(Base36Chars from I + 1 for 1);
    END
    AResult = LEFT(mResult || '0000000', 7);
  Suspend;
END;

-- Generate test string data
-- 000000, 100000...900000...A00000...Z00000,
-- 010000, 110000...910000...A10000...Z10000,
-- ...

EXECUTE BLOCK
AS
  DECLARE ROWSCOUNT INT = 1000000;
  DECLARE I INT = 0;
  DECLARE C INT = 0;
  DECLARE Str VARCHAR(10);
BEGIN
  WHILE (C < ROWSCOUNT) DO
  BEGIN
    SELECT AResult from GetStr(:I) into :Str;
    -- Skip Y, Z
    IF ((LEFT(Str, 1) <> 'Y') AND (LEFT(Str, 1) <> 'Z')) THEN BEGIN
      INSERT INTO TEST1M(ANSI_CZ, UNICODE_CS_CZ, UNICODE_CI_CZ, UNICODE_CS, UNICODE_CI) VALUES (:Str, :Str, :Str, :Str, :Str);
      C = C + 1;
    END
    I = I + 1;
  END
END;

CREATE INDEX TEST1M_ANSI_CZ ON TEST1M (ANSI_CZ);
CREATE INDEX TEST1M_UNICODE_CS_CZ ON TEST1M (UNICODE_CS_CZ);
CREATE INDEX TEST1M_UNICODE_CI_CZ ON TEST1M (UNICODE_CI_CZ);
CREATE INDEX TEST1M_UNICODE_CS ON TEST1M (UNICODE_CS);
CREATE INDEX TEST1M_UNICODE_CI ON TEST1M (UNICODE_CI);

SELECT COUNT(*) FROM TEST1M;
  -- Time 269ms                           <<<<<<< OK >>>>>>>
  -- Result 1000000

--######################################### Scenario use WHERE >= #########################################
-- Problem only in CZECH collate case insensitive
   
  SELECT ANSI_CZ FROM TEST1M
  WHERE ANSI_CZ >= 'Z'
  ORDER BY ANSI_CZ;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing                     
 
  SELECT UNICODE_CS FROM TEST1M
  WHERE UNICODE_CS >= 'Z'
  ORDER BY UNICODE_CS;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing                     
 
  SELECT UNICODE_CI FROM TEST1M
  WHERE UNICODE_CI >= 'Z'
  ORDER BY UNICODE_CI;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing

  SELECT UNICODE_CS_CZ FROM TEST1M
  WHERE UNICODE_CS_CZ >= 'Z'
  ORDER BY UNICODE_CS_CZ;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing

  SELECT UNICODE_CI_CZ FROM TEST1M        
  WHERE UNICODE_CI_CZ >= 'Z'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 4,294s                       <<<<<<< COLLATE CASE INSENSITIVE HEAR IS PERFORMANCE PROBLEM FOR VALUE "Z" 4,294s WHY? VALUE "Y" IS OK 0ms >>>>>>>
    -- Result nothing

  SELECT UNICODE_CI_CZ FROM TEST1M
  WHERE UNICODE_CI_CZ >= 'Y'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing
 
  SELECT FIRST 1 UNICODE_CI_CZ FROM TEST1M        
  WHERE UNICODE_CI_CZ >= 'C'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 1,531s                       <<<<<<< collate case INSENSITIVE NEXT PERFOMANCE PROBLEM HAVE VALUE "C" 1,531s WHY? VALUE "D" IS OK 0ms >>>>>>>
    -- Result C000000   

  SELECT FIRST 1 UNICODE_CI_CZ FROM TEST1M        
  WHERE UNICODE_CI_CZ >= 'D'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result D000000      
 
--######################################### Scenario use WHERE like #########################################   
-- Problem in CZECH collate case sensitive and insensitive    
  SELECT ANSI_CZ FROM TEST1M
  WHERE ANSI_CZ LIKE 'Z%'
  ORDER BY ANSI_CZ;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing      
                   
  SELECT UNICODE_CS FROM TEST1M
  WHERE UNICODE_CS LIKE 'Z%'
  ORDER BY UNICODE_CS;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing                     

  SELECT UNICODE_CI FROM TEST1M
  WHERE UNICODE_CI LIKE 'Z%'
  ORDER BY UNICODE_CI;
    -- Time: 0ms                          <<<<<<< OK >>>>>>>
    -- Result nothing

  SELECT UNICODE_CS_CZ FROM TEST1M
  WHERE UNICODE_CS_CZ LIKE 'Z%'
  ORDER BY UNICODE_CS_CZ;
    -- Time: 4,247s                       <<<<<<< collate case SENSITIVE HEAR IS PERFOMANCE PROBLEM FOR VALUE "Z" 4,247s WHY? VALUE "Y" IS OK 0ms >>>>>>>
    -- Result nothing

  SELECT UNICODE_CS_CZ FROM TEST1M
  WHERE UNICODE_CS_CZ LIKE 'Y%'
  ORDER BY UNICODE_CS_CZ;
    -- Time: 0ms                          <<<<<<< FOR letter "Y" IS NOT PROBLEM :) WHY? >>>>>>>
    -- Result nothing
 
  SELECT UNICODE_CI_CZ FROM TEST1M        
  WHERE UNICODE_CI_CZ LIKE 'Z%'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 4,52s                        <<<<<<< HEAR IS PERFORMANCE PROBLEM WHY? >>>>>>>
    -- Result nothing

  SELECT UNICODE_CI_CZ FROM TEST1M
  WHERE UNICODE_CI_CZ LIKE 'Y%'
  ORDER BY UNICODE_CI_CZ;
    -- Time: 0ms                          <<<<<<< FOR letter "Y" IS NOT PROBLEM :) WHY? >>>>>>>
    -- Result nothing

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions