Integrations / Platforms / WordPress / Splitting Large Records

Splitting Large Records

Algolia has a limit of 10 KB per record for performance and relevance reasons. Instead of indexing large pieces of text into a single record, you can split them into multiple records and use the distinct feature to deduplicate results at query time.

Building an HTML splitter

The following HTML splitter creates one record per <h2> heading and includes as many paragraphs as possible. You create a new record every time there’s a new <h2>, or when you’ve reached the content limit.

It also stores every <h3> heading into an array. This is helpful for relevance, as you can set headings before content in searchableAttributes to give them more weight.

You can add all splitters into a splitters.php file, and require it from your main plugin file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
namespace Algolia;

use DOMDocument;

class HtmlSplitter
{
    /** Tag that starts a new record. */
    protected $level1 = 'h2';

    /** Tag whose text is collected into the `subtitle-2` list of the current record. */
    protected $level2 = 'h3';

    /** Number of content characters after which the current record is closed. */
    protected $contentLimit = 1000;

    /**
     * Splits the rendered content of a post into record-sized pieces.
     *
     * A new piece starts at every level-1 heading, or as soon as the content
     * accumulated in the current piece exceeds the content limit.
     *
     * @param  \WP_Post $post
     *
     * @return array List of pieces, each shaped as
     *               ['subtitle' => string, 'subtitle-2' => string[], 'content' => string].
     *               Empty when the post has no content.
     */
    public function split(\WP_Post $post) {
        $html = $this->get_sanitized_content($post);

        // DOMDocument::loadHTML() rejects an empty string.
        if ('' === trim($html)) {
            return [];
        }

        $dom = new DOMDocument();

        // Real-world post markup (HTML5 tags, unclosed elements...) makes libxml
        // emit warnings; collect them silently instead of polluting the output.
        $previousErrorMode = libxml_use_internal_errors(true);
        // Without an encoding hint libxml decodes the input as ISO-8859-1 and
        // mangles every multi-byte UTF-8 character.
        $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $body = $dom->getElementsByTagName('body')->item(0);

        if (null === $body) {
            return [];
        }

        $split = [];
        $current = $this->newPiece();

        foreach ($body->childNodes as $node) {
            if ($node instanceof \DOMElement) {
                $tag = $node->tagName;
                $value = $this->get_node_content($node);
            } elseif ($node instanceof \DOMText) {
                // Bare text directly under <body> has no tag name; treat it as content.
                $tag = '';
                $value = $node->textContent;
            } else {
                // Comments, processing instructions, etc. carry nothing to index.
                continue;
            }

            if ($tag === $this->level1) {
                // Close the piece in progress, unless nothing was collected yet
                // (e.g. the post starts with a heading).
                if (!$this->isEmptyPiece($current)) {
                    $split[] = $current;
                }
                $current = $this->newPiece($value);
                continue;
            }

            if ($tag === $this->level2) {
                $current['subtitle-2'][] = $value;
                continue;
            }

            // Whitespace between block elements and text-less nodes (images,
            // empty paragraphs) would only add blank lines to the content.
            if ('' === trim($value)) {
                continue;
            }

            $current['content'][] = $value;

            if ($this->isContentLargeEnough($current['content'])) {
                $split[] = $current;
                $current = $this->newPiece();
            }
        }

        // The piece being built when the document ends still has to be emitted.
        if (!$this->isEmptyPiece($current)) {
            $split[] = $current;
        }

        foreach ($split as $key => $piece) {
            $split[$key]['content'] = implode("\n\n", $piece['content']);
        }

        return $split;
    }

    /**
     * Renders the post content and strips what must not reach the index.
     *
     * @param  \WP_Post $post
     *
     * @return string HTML on a single line, without <script>/<style> elements.
     */
    private function get_sanitized_content(\WP_Post $post) {
        $the_content = (string) apply_filters('the_content', $post->post_content);

        // Remove <script> and <style> elements: their text is code, not content.
        $stripped = preg_replace('#<(script|style)\b[^>]*>.*?</\1>#is', '', $the_content);
        // preg_replace() returns null on failure (e.g. backtrack limit); keep the input then.
        if (null !== $stripped) {
            $the_content = $stripped;
        }

        // Turn line breaks into spaces so words on adjacent lines stay separated.
        return str_replace(["\r\n", "\n", "\r"], ' ', $the_content);
    }

    /**
     * Returns the text of a node; lists are rendered as one " - item" line per child.
     *
     * @param  \DOMElement $node
     *
     * @return string
     */
    private function get_node_content(\DOMElement $node) {
        if (in_array($node->tagName, ['ul', 'ol'], true)) {
            $text = [];
            foreach ($node->childNodes as $li) {
                $item = trim((string) $li->nodeValue);
                // Skip the whitespace-only text nodes that sit between <li> elements.
                if ('' !== $item) {
                    $text[] = $item;
                }
            }

            return empty($text) ? '' : ' - '.implode("\n - ", $text);
        }

        return $node->textContent;
    }

    /**
     * Tells whether the given content exceeds the content limit.
     *
     * @param  string|string[] $content
     *
     * @return bool
     */
    private function isContentLargeEnough($content) {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $this->contentLimit;
    }

    /**
     * Creates an empty piece, optionally titled with a level-1 heading.
     *
     * @param  string $subtitle
     *
     * @return array
     */
    private function newPiece($subtitle = '') {
        return [
            'subtitle' => $subtitle,
            'subtitle-2' => [],
            'content' => [],
        ];
    }

    /**
     * Tells whether a piece holds no heading and no content at all.
     *
     * @param  array $piece
     *
     * @return bool
     */
    private function isEmptyPiece(array $piece) {
        return '' === $piece['subtitle']
            && empty($piece['subtitle-2'])
            && empty($piece['content']);
    }
}

Splitting records

In this example, we modify the end of the algolia_post_to_record function in the theme’s functions.php file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/**
 * Converts a post into a list of Algolia records, one per chunk of content.
 *
 * Every record shares the post-level attributes (including `distinct_key`,
 * used to group the chunks of one post) and gets its own `objectID`.
 *
 * @param WP_Post $post
 *
 * @return array List of records.
 */
function algolia_post_to_record(WP_Post $post) {
    // wp_get_post_terms() returns a WP_Error instead of an array on failure.
    $terms = wp_get_post_terms($post->ID, 'post_tag');
    $tags = is_array($terms)
        ? array_map(function (WP_Term $term) {
            return $term->name;
        }, $terms)
        : [];

    // get_user_by() returns false when the author no longer exists.
    $author = get_user_by( 'ID', $post->post_author );

    // Prepare all common attributes and add a new `distinct_key` property.
    // `content` is deliberately left out: each split provides its own chunk.
    $common = [
        'distinct_key' => implode('#', [$post->post_type, $post->ID]),
        'title' => $post->post_title,
        'author' => [
            'id' => $post->post_author,
            'name' => $author ? $author->display_name : '',
        ],
        'excerpt' => $post->post_excerpt,
        'tags' => $tags,
        'url' => get_post_permalink($post->ID),
    ];

    // Split the records on the `post_content` attribute
    $splitter = new \Algolia\HtmlSplitter;
    $records = $splitter->split($post);

    // Keep the post searchable by its title/tags even when the splitter
    // produced nothing: index it as a single record.
    if (empty($records)) {
        $records = [[
            'subtitle' => '',
            'subtitle-2' => [],
            'content' => strip_tags($post->post_content),
        ]];
    }

    // Merge the common attributes into each split and add a unique `objectID`
    foreach ($records as $key => $split) {
        $records[$key] = array_merge($common, $split, [
            'objectID' => implode('-', [$post->post_type, $post->ID, $key]),
        ]);
    }

    return $records;
}
add_filter('post_to_record', 'algolia_post_to_record');

Make sure to set distinct_key as the attributeForDistinct in your Algolia settings so you can use it with distinct, and add it to attributesForFaceting so you can filter on it (for example, with deleteBy).

Note that instead of returning a single record, the function now returns a list of records. You need to update the reindex_post command to take this into account.

1
2
3
4
5
6
7
8
9
10
// Serialize every post into its records and queue them for indexing.
foreach ($posts->posts as $post) {
    // The key is missing from $assoc_args when --verbose isn't passed.
    if (!empty($assoc_args['verbose'])) {
        WP_CLI::line('Serializing ['.$post->post_title.']');
    }

    // The filter returns a list of records (one per split), not a single record.
    $split = apply_filters('post_to_record', $post);

    $records = array_merge($records, $split);
    $count++;
}

Automatic update for split records

If you’re indexing long posts and splitting them into multiple records, you need to tweak a few things for the automatic update to work.

When updating a post, it can potentially become shorter and take fewer records. This means you need to delete old records for a given post before indexing the new ones. You can delete all records for a given post by using the deleteBy method on the distinct_key attribute.

1
// Delete every record matching the filter, i.e. all the records of one post.
// NOTE(review): 'index_name' and 'distinct_value' look like placeholders —
// replace them with your index name and the post's distinct_key value.
$algolia->initIndex('index_name')->deleteBy(['filters' => 'distinct_key:distinct_value']);

Your final algolia_update_post function, with record splitting support, should look like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
 * Keeps the Algolia index in sync when a post is saved or trashed.
 *
 * Handles both a single record and a list of split records returned by the
 * `{post_type}_to_record` filter. Split records are always deleted first,
 * because an updated post may produce fewer records than before.
 *
 * @param int     $id     Post ID.
 * @param WP_Post $post   Post being saved.
 * @param bool    $update Whether this is an update of an existing post.
 *
 * @return WP_Post The post, untouched.
 */
function algolia_update_post($id, WP_Post $post, $update) {
    if (wp_is_post_revision($id) || wp_is_post_autosave($id)) {
        return $post;
    }

    global $algolia;

    $record = (array) apply_filters($post->post_type.'_to_record', $post);

    // Detect the shape before touching the array: adding an `objectID` key to
    // a list of records would make it stop looking like a list.
    $splitRecord = isSplitRecord($record);

    if (!$splitRecord && !isset($record['objectID'])) {
        $record['objectID'] = implode('#', [$post->post_type, $post->ID]);
    }

    $index = $algolia->initIndex(
        apply_filters('algolia_index_name', $post->post_type)
    );

    // If the post is split, we always delete it
    if ($splitRecord) {
        // All splits of a post share the same key; read it from the first one.
        $distinctKey = isset($record[0]['distinct_key'])
            ? $record[0]['distinct_key']
            : implode('#', [$post->post_type, $post->ID]);

        $index->deleteBy(['filters' => 'distinct_key:'.$distinctKey]);
    }

    if ('trash' === $post->post_status) {
        // If the post was split, it's already deleted
        if (!$splitRecord) {
            $index->deleteObject($record['objectID']);
        }
    } elseif ($splitRecord) {
        $index->saveObjects($record);
    } else {
        // saveObjects() expects a list of records; a lone record goes through saveObject().
        $index->saveObject($record);
    }

    return $post;
}

add_action('save_post', 'algolia_update_post', 10, 3);

/**
 * Tells whether the given value is a list of split records rather than a
 * single record.
 *
 * @param array $arr Value returned by the `*_to_record` filter.
 *
 * @return bool True when $arr is a non-empty array keyed 0..n-1.
 */
function isSplitRecord(array $arr) {
    // An empty array holds no split at all.
    if (empty($arr)) {
        return false;
    }

    // Split records must be an indexed array
    return array_keys($arr) === range(0, count($arr) - 1);
}

Did you find this page helpful?