Integrations / Platforms / WordPress / Splitting Large Records

Splitting Large Records

Splitting large records

For performance and relevance reasons, records in Algolia should be a few kilobytes at most. Large pieces of content need to be split into smaller records by splitting on the largest attribute, typically the content. When searching, you’ll need to use distinct to deduplicate the results, so that each post appears only once.

Algolia is not a relational database: all records are completely independent. To properly split a post into multiple records, we need to:

  • Split the post_content attribute
  • Duplicate all other attributes
  • Add a distinct_key attribute
  • Ensure objectID is unique

Since objectID must be unique, we add a new attribute, distinct_key, to be able to find all the records that come from the same post.

In this example, we’re going to modify the end of the previous algolia_post_to_record function in the theme’s functions.php.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/**
 * Converts a post into a list of Algolia records, one per content chunk.
 *
 * Each record is made of the post-level attributes (shared by all chunks),
 * the attributes of one chunk returned by \Algolia\HtmlSplitter::split(),
 * and an `objectID` that is unique per chunk.
 *
 * @param WP_Post $post The post to serialize.
 *
 * @return array[] List of records; empty when the splitter returns no chunk.
 */
function algolia_post_to_record(WP_Post $post) {
    // wp_get_post_terms() may return a WP_Error instead of a list of terms.
    $terms = wp_get_post_terms($post->ID, 'post_tag');
    $tags = is_wp_error($terms) ? [] : array_map(function (WP_Term $term) {
        return $term->name;
    }, $terms);

    // get_user_by() returns false when the author cannot be found.
    $author = get_user_by('ID', $post->post_author);

    // Prepare all the common attributes
    // Add a new `distinct_key` (same value as the previous objectID)
    // The content is deliberately left out: every chunk brings its own `content`.
    $common = [
        'distinct_key' => implode('#', [$post->post_type, $post->ID]),
        'title' => $post->post_title,
        'author' => [
            'id' => $post->post_author,
            'name' => $author ? $author->display_name : null,
        ],
        'excerpt' => $post->post_excerpt,
        'tags' => $tags,
        'url' => get_post_permalink($post->ID),
    ];

    // Split the records on the `post_content` attribute
    $splitter = new \Algolia\HtmlSplitter;
    $records = $splitter->split($post);

    // Merge the common attributes into every chunk
    // Add a unique objectID (post type, post ID and chunk position)
    foreach ($records as $key => $split) {
        $records[$key] = array_merge($common, $split, [
            'objectID' => implode('-', [$post->post_type, $post->ID, $key]),
        ]);
    }

    return $records;
}
add_filter('post_to_record', 'algolia_post_to_record');

Note that instead of returning a record, we returned a list of records. We’ll update the reindex_post command to take this into account.

1
2
3
4
5
6
7
8
9
10
foreach ($posts->posts as $post) {
    // `verbose` may be missing from $assoc_args when the flag is not passed;
    // empty() avoids an undefined-index notice in that case.
    if (! empty($assoc_args['verbose'])) {
        WP_CLI::line('Serializing ['.$post->post_title.']');
    }

    // The filter now returns a list of records for the post, not a single record.
    $split = apply_filters('post_to_record', $post);

    // Append the records of this post to the batch.
    $records = array_merge($records, $split);
    // Incremented once per post, regardless of how many records it produced.
    $count++;
}

The following splitter creates one record per H2 title with as many paragraphs as possible. We create a new record every time there is a new H2 title, or enough paragraphs were added. We also store every H3 title into an array. This will help for relevancy by adding title and subtitles higher than the content in the searchableAttributes.

You can add all splitters into a dedicated file (for example `splitters.php`) and require this file from your main plugin file.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
namespace Algolia;

use DOMDocument;

/**
 * Splits the HTML content of a post into several smaller pieces.
 *
 * A new piece starts at every `$level1` title, and whenever the text
 * accumulated in the current piece grows past `$contentLimit` characters.
 * `$level2` titles are collected in the `subtitle-2` list of the current piece.
 */
class HtmlSplitter
{
    /** @var string Tag name that starts a new piece and becomes its `subtitle`. */
    protected $level1 = 'h2';

    /** @var string Tag name whose text is appended to `subtitle-2`. */
    protected $level2 = 'h3';

    /** @var int Number of characters above which the current piece is closed. */
    protected $contentLimit = 1000;

    /**
     * Splits the content of the given post.
     *
     * @param  \WP_Post $post
     *
     * @return array[] List of pieces, each with the keys `subtitle` (string),
     *                 `subtitle-2` (string[]) and `content` (string).
     *                 Empty when the post has no usable content.
     */
    public function split(\WP_Post $post)
    {
        $html = $this->get_sanitized_content($post);
        if (trim($html) === '') {
            return [];
        }

        $body = $this->loadBody($html);
        if ($body === null) {
            return [];
        }

        $split = [];
        $current = $this->newPiece();

        foreach ($body->childNodes as $node) {
            // Only elements and text carry content; comments and the like are skipped.
            if (!($node instanceof \DOMElement) && !($node instanceof \DOMText)) {
                continue;
            }

            $tag = $node instanceof \DOMElement ? $node->tagName : null;
            $value = $this->get_node_content($node);

            if ($tag === $this->level1) {
                // A new section begins: close the one in progress, unless it holds nothing.
                if (!$this->isPieceEmpty($current)) {
                    $split[] = $current;
                }
                $current = $this->newPiece($value);
                continue;
            }

            if ($tag === $this->level2) {
                $current['subtitle-2'][] = $value;
                continue;
            }

            // Whitespace between blocks and empty paragraphs add nothing to search.
            if ($value === '') {
                continue;
            }

            $current['content'][] = $value;

            if ($this->isContentLargeEnough($current['content'])) {
                $split[] = $current;
                $current = $this->newPiece();
            }
        }

        // Keep whatever was accumulated after the last title or size-based cut.
        if (!$this->isPieceEmpty($current)) {
            $split[] = $current;
        }

        foreach ($split as $key => $piece) {
            $split[$key]['content'] = implode("\n\n", $piece['content']);
        }

        return $split;
    }

    /**
     * Parses an HTML fragment and returns its <body> node.
     *
     * @param  string $html
     *
     * @return \DOMNode|null Null when the markup cannot be parsed or has no body.
     */
    private function loadBody($html)
    {
        $dom = new \DOMDocument();

        // Collect parser errors internally instead of emitting PHP warnings.
        $previous = libxml_use_internal_errors(true);
        // The XML declaration makes the parser read the fragment as UTF-8.
        $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            return null;
        }

        return $dom->getElementsByTagName('body')->item(0);
    }

    /**
     * Builds an empty piece, so every piece has the same set of keys.
     *
     * @param  string $subtitle
     *
     * @return array
     */
    private function newPiece($subtitle = '')
    {
        return [
            'subtitle' => $subtitle,
            'subtitle-2' => [],
            'content' => [],
        ];
    }

    /**
     * Tells whether a piece holds no title, no subtitle and no content.
     *
     * @param  array $piece
     *
     * @return bool
     */
    private function isPieceEmpty(array $piece)
    {
        return $piece['subtitle'] === ''
            && empty($piece['subtitle-2'])
            && empty($piece['content']);
    }

    /**
     * Returns the post content after the `the_content` filters, without
     * <script>/<style> blocks and with line breaks turned into spaces.
     *
     * @param  \WP_Post $post
     *
     * @return string
     */
    private function get_sanitized_content( \WP_Post $post )
    {
        $the_content = (string) apply_filters('the_content', $post->post_content);

        // Remove <script> and <style> blocks, their text is not searchable content
        $stripped = preg_replace('#<(script|style)\b[^>]*>.*?</\1\s*>#is', '', $the_content);
        if (is_string($stripped)) {
            // preg_replace() returns null on failure; keep the unstripped content then.
            $the_content = $stripped;
        }

        // Replace line breaks with a space so that words on both sides stay separate
        return str_replace(["\r\n", "\r", "\n"], ' ', $the_content);
    }

    /**
     * Returns the text of a node; lists are rendered with one " - " line per item.
     *
     * @param  \DOMNode $node
     *
     * @return string
     */
    private function get_node_content( \DOMNode $node )
    {
        if ($node instanceof \DOMElement && in_array($node->tagName, ['ul', 'ol'], true)) {
            $items = [];
            foreach ($node->childNodes as $li) {
                $text = trim($li->textContent);
                if ($text !== '') {
                    $items[] = $text;
                }
            }

            return $items === [] ? '' : ' - '.implode("\n - ", $items);
        }

        return trim($node->textContent);
    }

    /**
     * Tells whether the given content is longer than `$contentLimit` characters.
     *
     * @param  string|string[] $content An array is joined with spaces before measuring.
     *
     * @return bool
     */
    private function isContentLargeEnough( $content ) {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $this->contentLimit;
    }
}

Automatic update for split records

If you’re indexing long posts and started to split your Algolia records, you need to do some extra work for the automatic update to work. Because your new post could be shorter than the old post, for example 3 records instead of 4, you need to delete all the existing records before you reindex the three new ones.

Similar to deleting a post, you can’t delete the records based on the objectID because there are multiple records for this post.

You can delete all records related to any post by using the deleteBy method.

1
// Delete every record whose `distinct_key` attribute matches the given value.
$algolia->initIndex('index_name')->deleteBy(['filters' => 'distinct_key:distinct_value']);

Make sure to declare distinct_key in the attributesForFaceting setting of your index (for example as filterOnly(distinct_key)); otherwise it can’t be used as a filter.

Your final function, with split record support, will look something like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/**
 * Keeps the Algolia index in sync when a post is saved or trashed.
 *
 * Supports both a single record and a list of split records returned by the
 * `{post_type}_to_record` filter. Split records are always deleted first,
 * because the new version of the post may produce fewer records than the old one.
 *
 * @param int     $id     Post ID.
 * @param WP_Post $post   Post object.
 * @param bool    $update Whether this is an existing post being updated (unused).
 *
 * @return WP_Post The post, unchanged.
 */
function algolia_update_post($id, WP_Post $post, $update) {
    if (wp_is_post_revision($id) || wp_is_post_autosave($id)) {
        return $post;
    }

    global $algolia;

    $record = (array) apply_filters($post->post_type.'_to_record', $post);
    $defaultKey = implode('#', [$post->post_type, $post->ID]);

    // Detect split records BEFORE adding anything to $record: adding an
    // `objectID` key to a list would make it look like a single record.
    // An empty list means the post produced no record at all; it is handled
    // like a split post so that its previous records are removed.
    $splitRecord = $record === [] || isSplitRecord($record);

    if (! $splitRecord && ! isset($record['objectID'])) {
        $record['objectID'] = $defaultKey;
    }

    $index = $algolia->initIndex(
        apply_filters('algolia_index_name', $post->post_type)
    );

    // If the post is split, we always delete it.
    // If it was deleted, we're good. If it's an update, we'll push the new version.
    if ($splitRecord) {
        // The distinct key lives on each child record, not on the list itself.
        $distinctKey = isset($record[0]['distinct_key']) ? $record[0]['distinct_key'] : $defaultKey;
        $index->deleteBy(['filters' => 'distinct_key:'.$distinctKey]);
    }

    // The status of a WP_Post is exposed as `post_status`.
    if ('trash' === $post->post_status) {
        // If the post was split, it's already deleted
        if (! $splitRecord) {
            $index->deleteObject($record['objectID']);
        }
    } elseif ($splitRecord) {
        if ($record !== []) {
            $index->saveObjects($record);
        }
    } else {
        // saveObjects() takes a list of records: wrap the single record.
        $index->saveObjects([$record]);
    }

    return $post;
}

add_action('save_post', 'algolia_update_post', 10, 3);

/**
 * Tells whether the given value is a list of split records rather than a single record.
 *
 * Split records come as a plain list (keys 0, 1, 2, ...), whereas a single
 * record is an associative array: there is no way you would use consecutive
 * numbers as the names of all your attributes.
 *
 * @param array $arr The value returned by the `*_to_record` filter.
 *
 * @return bool True for a non-empty list with consecutive integer keys starting at 0;
 *              false otherwise, including for an empty array.
 */
function isSplitRecord(array $arr) {
    if ($arr === []) {
        return false;
    }

    // Strict comparison: with `==`, PHP 7 considers a string key such as 'title' equal to 0.
    return array_keys($arr) === range(0, count($arr) - 1);
}

Did you find this page helpful?