...
- Query performance can be improved dramatically in certain situations, particularly when most of the heavily accessed rows of the table are in a single partition or a small number of partitions. The partitioning substitutes for leading columns of indexes, reducing index size and making it more likely that the heavily-used parts of the indexes fit in memory.
- When queries or updates access a large percentage of a single partition, performance can be improved by taking advantage of the sequential scan of that partition instead of using an index and random access reads scattered across the whole table.
- Seldom-used data can be stored in separate partitions and migrated to cheaper and slower storage media.
So the idea is to partition the table by the `field_no` column and change the table structure at the same time to use only simple types.
MARC leader has a very strict structure defined at https://www.loc.gov/marc/bibliographic/bdleader.html. So, it makes sense to store separately every part of the leader value to support search conditions addressed to the leader and make them simple. This means that a dedicated table should be created to store leader values.
Partitioned table
marc_indexers |
field_no varchar(3) |
ind1 varchar(1) |
ind2 varchar(1) |
subfield_no varchar(1) |
value text |
marc_id uuid |
...
Code Block | ||
---|---|---|
| ||
do $$ declare ind integer; suffix text; begin for ind in 0 .. 999 loop suffix = lpad(ind::text, 3, '0'); execute 'marc_indexers_' || suffix || ' partition of marc_indexers for values in ('''|| suffix ||''');'; end loop; create table marc_indexers_leadercat partition of marc_indexers for values in ('ldrcat'); create table marc_indexers_cat partition of marc_indexers for values in ('cat'); create table marc_indexers_own partition of marc_indexers for values in ('own'); create table marc_indexers_lkr partition of marc_indexers for values in ('lkr'); end; $$; |
...
The main concern here is that apart from standard field numbers and leader some libraries can use their own identifiers as field_no as in the example above. So there must be a solution to handle such cases.
Filling in data.
There are two events when data must be added or changed in that new table.
The first one is the initial data loading once this feature is added. It can be done with a single PL/pgSQL block that parses and inserts all data for all MARC records.
...
language | sql |
---|
...
Leader table
marc_indexers_leader |
---|
p_00_04 varchar(5) |
p_05 varchar(1) |
p_06 varchar(1) |
p_07 varchar(1) |
p_08 varchar(1) |
p_09 varchar(1) |
p_10 varchar(1) |
p_11 varchar(1) |
p_12_16 varchar(5) |
p_17 varchar(1) |
p_18 varchar(1) |
p_19 varchar(1) |
p_20 varchar(1) |
p_21 varchar(1) |
p_22 varchar(1) |
marc_id uuid |
Code Block | ||
---|---|---|
| ||
create table marc_indexers_leader
(
p_00_04 varchar(5) constraint nn_marc_indexers_leader_p_00_04 not null,
p_05 varchar(1) constraint nn_marc_indexers_leader_p_05 not null,
p_06 varchar(1) constraint nn_marc_indexers_leader_p_06 not null,
p_07 varchar(1) constraint nn_marc_indexers_leader_p_07 not null,
p_08 varchar(1) constraint nn_marc_indexers_leader_p_08 not null,
p_09 varchar(1) constraint nn_marc_indexers_leader_p_09 not null,
p_10 varchar(1) constraint nn_marc_indexers_leader_p_10 not null,
p_11 varchar(1) constraint nn_marc_indexers_leader_p_11 not null,
p_12_16 varchar(5) constraint nn_marc_indexers_leader_p_12_16 not null,
p_17 varchar(1) constraint nn_marc_indexers_leader_p_17 not null,
p_18 varchar(1) constraint nn_marc_indexers_leader_p_18 not null,
p_19 varchar(1) constraint nn_marc_indexers_leader_p_19 not null,
p_20 varchar(1) constraint nn_marc_indexers_leader_p_20 not null,
p_21 varchar(1) constraint nn_marc_indexers_leader_p_21 not null,
p_22 varchar(1) constraint nn_marc_indexers_leader_p_22 not null,
marc_id uuid constraint nn_marc_indexers_marc_id not null
);
|
Filling in data.
There are two events when data must be added or changed in that new table.
The first one is the initial data loading once this feature is added. It can be done with a single PL/pgSQL block that parses and inserts all data for all MARC records.
Code Block | ||
---|---|---|
| ||
do $$ <<fill_marc_indexers_simple_block>> declare rec record; begin for rec in (select id, content from marc_records_lb x) loop insert into marc_indexers (field_no, ind1, ind2, subfield_no, value, marc_id) (with vals as (select value from jsonb_array_elements(( select value From jsonb_each(rec.content) x where key = 'fields')) y), fields as (select x.key as field_no, x.value as field_value From vals, jsonb_each(vals.value) x), fields_subfields as ( select field_no, trim(field_value ->> 'ind1'::text) ind1, trim(field_value ->> 'ind2') ind2, field_value -> 'subfields' subfields, field_value From fields), marc_raw as ( select fs.field_no, fs.ind1, fs.ind2, fs.field_value, null::text subfield_no, null::text subfield_value from fields_subfields fs where subfields is null union all select fs.field_no, fs.ind1, fs.ind2, fs.field_value, subfs.key::text subfield_no, subfs.value::text subfield_value From fields_subfields fs, jsonb_array_elements(fs.subfields) sx, jsonb_each(sx.value) subfs where subfields is not null), marc as ( select m.field_no, CASE WHEN ind1 IS NULL or ind1 = '' THEN '#' ELSE ind1 END as ind1, CASE WHEN ind2 IS NULL or ind2 = '' THEN '#' ELSE ind2 END as ind2, CASE WHEN subfield_no IS NULL or trim(subfield_no) = '' THEN '0' ELSE subfield_no END as subfield_no, trim(both '"' from coalesce(subfield_value, field_value::text)) as value From marc_raw m) select distinct lower(field_no) field_no, ind1, ind2, subfield_no, value, rec.id marc_id from marc); -- insert into marc_indexers_leader(p_00_04, p_05, p_06, p_07, p_08, p_09, p_10, p_11, p_12_16, p_17, p_18, p_19, p_20, p_21, p_22, marc_id) (select substring(value from 1 for 5) p_00_04, substring(value from 6 for 1) p_05, substring(value from 7 for 1) p_06, substring(value from 8 for 1) p_07, substring(value from 9 for 1) p_08, substring(value from 10 for 1) p_09, substring(value from 11 for 1) p_10, substring(value from 12 for 1) p_11, substring(value from 13 for 5) p_12_16, substring(value from 18 for 1) 
p_17, subfields, field_value From fields), marc_raw as ( select fs.field_no, fs.ind1, fs.ind2, fs.field_value, null::text subfield_no, null::text subfield_value from fields_subfields fs where subfields is null union all select fs.field_no, fs.ind1, fs.ind2, fs.field_value, subfs.key::text subfield_no, subfs.value::text subfield_value From fields_subfields fs, jsonb_array_elements(fs.subfields) sx, jsonb_each(sx.value) subfs where subfields is not null), marc as ( select m.field_no, CASE WHEN ind1 IS NULL or ind1 = '' THEN '#' ELSE ind1 END substring(value from 19 for 1) p_18, substring(value from 20 for 1) p_19, substring(value from 21 for 1) p_20, substring(value from 22 for 1) p_21, as ind1, CASE WHEN ind2 IS NULL or ind2 = '' THEN '#' ELSE ind2 END substring(value from 23 for 1) p_22, marc_id as ind2, from CASE( WHEN subfield_no IS NULL or trim(subfield_no) = '' THEN '0' ELSE subfield_no END as subfield_no, trim select replace(lower(trim(both '"' from coalesce(subfield_value, field_value::text)), ' ', '#') as value, as value From marc_raw m) select distinct lower(field_no) field_no, ind1, ind2, subfield_no, value, rec.id marc_id from marc union all select 'ldr' field_no, '#' ind1, '#' ind2, '0' subfield_no, trim(both '"' from value::text) as value, rec.id marc_id From jsonb_each( (select content from marc_records_lb where id = rec.id)) x where rec.id marc_id From jsonb_each(rec.content) x where key = 'leader') y); end end loop; end end fill_marc_indexers_simple_block $$; |
...
Code Block | ||
---|---|---|
| ||
create index idx_marc_indexers_marc_id on marc_indexers(marc_id);
create index idx_marc_indexers_001_vals on marc_indexers_001 using gin(regexp_match(value, '\W*((?:[a-zA-Z]*)|(?:[0-9]*))\W*((?:[a-zA-Z]*)|(?:[0-9]*))\W*((?:[a-zA-Z]*)|(?:[0-9]*))\W*') array_ops);
create index idx_marc_indexers_035_vals on marc_indexers_035 using gin(regexp_match(value, '\(*([a-zA-Z]*?)\)*(\w+)') array_ops);
create index idx_marc_indexers_005_vals on marc_indexers_005(value);
create index idx_marc_indexers_999_vals on marc_indexers_999(value);
create index idx_marc_indexers_leader_marc_id on diku_mod_source_record_storage.marc_indexers_leader(marc_id); |
Based on test runs it is not necessary to create indexes for partitions with fewer than 1.5 million rows, because even sequential scans complete in less than half a second on such partitions.
...