Splitting Large Records
Algolia doesn’t provide support for WordPress or any WordPress plugin.
Algolia has a limit of 10 KB per record for performance and relevance reasons. Instead of indexing large pieces of text into a single record, you can split them into multiple records and use the distinct
feature to deduplicate results at query time.
Building an HTML splitter
The following HTML splitter creates one record per <h2> heading and includes as many paragraphs as possible. It starts a new record every time there’s a new <h2>, or when the content limit is reached.
It also stores every <h3>
heading into an array. This is helpful for relevance, as you can set headings before content in searchableAttributes
to give them more weight.
You can add all splitters into a splitters.php
file, and require it from your main plugin file.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
namespace Algolia;
use DOMDocument;
class HtmlSplitter
{
    /** @var string Tag name that starts a new piece. */
    protected $level1 = 'h2';

    /** @var string Tag name whose text is collected into the `subtitle-2` list. */
    protected $level2 = 'h3';

    /** @var int Number of characters above which the current piece is closed. */
    protected $contentLimit = 1000;

    /**
     * Splits the rendered content of a post into pieces.
     *
     * A new piece starts at every level-1 heading, or as soon as the content
     * accumulated in the current piece exceeds the content limit. Level-2
     * headings are collected separately so they can be weighted differently
     * from the body text.
     *
     * @param \WP_Post $post Post whose `post_content` is split.
     *
     * @return array List of pieces. Each piece is an array with the keys
     *               `subtitle` (string), `subtitle-2` (list of strings) and
     *               `content` (string, paragraphs joined by a blank line).
     */
    public function split(\WP_Post $post) {
        $html = $this->get_sanitized_content($post);
        if (trim($html) === '') {
            // DOMDocument::loadHTML() rejects an empty document.
            return [];
        }

        $dom = new DOMDocument();
        // Real-world HTML (HTML5 tags, stray entities) makes libxml emit
        // warnings; collect them internally instead of printing them.
        $previousSetting = libxml_use_internal_errors(true);
        // Without a charset hint libxml treats the input as ISO-8859-1 and
        // mangles non-ASCII characters.
        $dom->loadHTML('<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'.$html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $body = $dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return [];
        }

        $split = [];
        $current = $this->new_piece();

        foreach ($body->childNodes as $node) {
            if ($node instanceof \DOMElement) {
                $tag = $node->tagName;
                $value = $this->get_node_content($node);
            } elseif ($node instanceof \DOMText) {
                // Bare text directly under <body> is treated as content.
                $tag = '';
                $value = trim($node->textContent);
            } else {
                // Comments and other node types carry no searchable text.
                continue;
            }

            if ($value === '') {
                continue;
            }

            if ($tag === $this->level1) {
                // Close the running piece, unless nothing was collected yet
                // (for instance when the post starts with a heading).
                if (!$this->is_piece_empty($current)) {
                    $split[] = $current;
                }
                $current = $this->new_piece($value);
            } elseif ($tag === $this->level2) {
                $current['subtitle-2'][] = $value;
            } else {
                $current['content'][] = $value;
            }

            if (!empty($current['content']) && $this->is_content_large_enough($current['content'])) {
                $split[] = $current;
                $current = $this->new_piece();
            }
        }

        // The piece still being built when the content ends must be kept too.
        if (!$this->is_piece_empty($current)) {
            $split[] = $current;
        }

        foreach ($split as $key => $piece) {
            $split[$key]['content'] = implode("\n\n", $piece['content']);
        }

        return $split;
    }

    /**
     * Builds an empty piece, optionally titled.
     *
     * @param string $subtitle Text of the level-1 heading that opens the piece.
     *
     * @return array
     */
    private function new_piece($subtitle = '') {
        return [
            'subtitle' => $subtitle,
            'subtitle-2' => [],
            'content' => [],
        ];
    }

    /**
     * Tells whether a piece holds neither a heading nor any content.
     *
     * @param array $piece Piece as built by new_piece().
     *
     * @return bool
     */
    private function is_piece_empty(array $piece) {
        return $piece['subtitle'] === ''
            && empty($piece['subtitle-2'])
            && empty($piece['content']);
    }

    /**
     * Renders the post content and strips what must not reach the parser.
     *
     * @param \WP_Post $post
     *
     * @return string Rendered HTML without <script> elements, on a single line.
     */
    private function get_sanitized_content(\WP_Post $post) {
        $the_content = (string) apply_filters('the_content', $post->post_content);

        // Remove <script> tags; keep the input if the regex engine fails.
        $without_scripts = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $the_content);
        if (is_string($without_scripts)) {
            $the_content = $without_scripts;
        }

        // Turn line breaks into spaces rather than deleting them, so words
        // separated only by a newline are not glued together.
        return str_replace(["\r\n", "\r", "\n"], ' ', $the_content);
    }

    /**
     * Extracts the text of a node; lists are rendered one item per line.
     *
     * @param \DOMElement $node
     *
     * @return string Trimmed text, or an empty string when there is none.
     */
    private function get_node_content(\DOMElement $node) {
        if (in_array($node->tagName, ['ul', 'ol'], true)) {
            $items = [];
            foreach ($node->childNodes as $child) {
                if ($child instanceof \DOMComment) {
                    continue;
                }
                // Skips the whitespace-only text nodes sitting between <li>s.
                $text = trim($child->textContent);
                if ($text !== '') {
                    $items[] = $text;
                }
            }

            return empty($items) ? '' : ' - '.implode("\n - ", $items);
        }

        return trim($node->textContent);
    }

    /**
     * Tells whether the given content exceeds the content limit.
     *
     * @param string|array $content A string, or a list of strings that is
     *                              joined with spaces before measuring.
     *
     * @return bool
     */
    private function is_content_large_enough($content) {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $this->contentLimit;
    }
}
Splitting records
The following example shows how to edit the end of the algolia_post_to_record
function in the theme’s functions.php
file.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/**
 * Converts a post into a list of Algolia records, one per content piece.
 *
 * Every record carries the same common attributes (title, author, tags, ...)
 * plus its own piece of content, a shared `distinct_key` used to deduplicate
 * results at query time, and a unique `objectID`.
 *
 * @param WP_Post $post Post to serialize.
 *
 * @return array List of records.
 */
function algolia_post_to_record(WP_Post $post) {
    // wp_get_post_terms() returns a WP_Error instead of a list on failure.
    $terms = wp_get_post_terms($post->ID, 'post_tag');
    $tags = is_array($terms)
        ? array_map(function (WP_Term $term) {
            return $term->name;
        }, $terms)
        : [];

    // get_user_by() returns false when the author no longer exists.
    $author = get_user_by('ID', $post->post_author);

    // Prepare all common attributes and add a new `distinct_key` property.
    // The full post content is deliberately left out: copying it into every
    // record would defeat the purpose of splitting.
    $common = [
        'distinct_key' => implode('#', [$post->post_type, $post->ID]),
        'title' => $post->post_title,
        'author' => [
            'id' => $post->post_author,
            'name' => $author ? $author->display_name : '',
        ],
        'excerpt' => $post->post_excerpt,
        'content' => '',
        'tags' => $tags,
        // get_permalink() handles built-in and custom post types alike.
        'url' => get_permalink($post),
    ];

    // Split the records on the `post_content` attribute
    $splitter = new \Algolia\HtmlSplitter;
    $records = $splitter->split($post);

    // Keep posts without any splittable content searchable by their
    // common attributes.
    if (empty($records)) {
        $records = [[]];
    }

    // Merge the common attributes into each split and add a unique `objectID`
    foreach ($records as $key => $split) {
        $records[$key] = array_merge($common, $split, [
            'objectID' => implode('-', [$post->post_type, $post->ID, $key]),
        ]);
    }

    return $records;
}
add_filter('post_to_record', 'algolia_post_to_record');
Make sure to set distinct_key as the attributeForDistinct in your Algolia index settings so you can use it with distinct, and add it to attributesForFaceting so you can filter on it (for example, with deleteBy).
Note that instead of returning a single record, the function now returns a list of records. You need to update the reindex_post command to take this into account.
1
2
3
4
5
6
7
8
9
10
foreach ($posts->posts as $post) {
    // The flag is optional, so the key may be missing from $assoc_args.
    if (!empty($assoc_args['verbose'])) {
        WP_CLI::line('Serializing ['.$post->post_title.']');
    }

    // The filter returns a list of records for each post.
    $split = apply_filters('post_to_record', $post);

    // Append in place: calling array_merge() here would copy the whole
    // accumulated list again on every iteration.
    foreach ($split as $record) {
        $records[] = $record;
    }

    $count++;
}
Automatic update for split records
If you’re indexing long posts and splitting them into multiple records, you need to tweak some things for the automatic update to work.
When updating a post, it could become shorter and take fewer records. This means you need to delete old records for a given post before indexing the new ones. You can delete all records for a given post by using the deleteBy
method on the distinct_key
attribute.
1
// Delete every record of the index that matches the `distinct_key` filter.
$algolia
    ->initIndex('index_name')
    ->deleteBy([
        'filters' => 'distinct_key:distinct_value',
    ]);
Your final algolia_update_post
function, with record splitting support, should look like this:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
 * Keeps the Algolia index in sync when a post is saved or trashed.
 *
 * Handles both shapes the `{post_type}_to_record` filter may return: a single
 * record, or a list of records produced by splitting a long post.
 *
 * @param int     $id     ID of the post being saved.
 * @param WP_Post $post   The post being saved.
 * @param bool    $update Whether this is an update of an existing post.
 *
 * @return WP_Post The post, unchanged.
 */
function algolia_update_post($id, WP_Post $post, $update) {
    if (wp_is_post_revision($id) || wp_is_post_autosave($id)) {
        return $post;
    }

    global $algolia;

    // NOTE(review): when no filter is registered for this post type, the raw
    // WP_Post is cast to an array and indexed as is — confirm this is intended.
    $record = (array) apply_filters($post->post_type.'_to_record', $post);

    // Must be decided before any key is added to $record: adding `objectID`
    // to a list of records would make it stop looking like a list.
    $splitRecord = is_split_record($record);

    if (!$splitRecord && !isset($record['objectID'])) {
        $record['objectID'] = implode('#', [$post->post_type, $post->ID]);
    }

    $index = $algolia->initIndex(
        apply_filters('algolia_index_name', $post->post_type)
    );

    // If the post is split, we always delete it first: the new version may
    // need fewer records than the previous one.
    if ($splitRecord) {
        // All records of a post share the same key; read it from the first.
        $distinctKey = (is_array($record[0]) && isset($record[0]['distinct_key']))
            ? $record[0]['distinct_key']
            : implode('#', [$post->post_type, $post->ID]);

        // Quoted so that the `#` separator is read as part of the value.
        $index->deleteBy(['filters' => 'distinct_key:"'.$distinctKey.'"']);
    }

    if ('trash' === $post->post_status) {
        // If the post was split, it's already deleted
        if (!$splitRecord) {
            $index->deleteObject($record['objectID']);
        }
    } else {
        // saveObjects() expects a list of records, so wrap a single record.
        $index->saveObjects($splitRecord ? $record : [$record]);
    }

    return $post;
}
add_action('save_post', 'algolia_update_post', 10, 3);
/**
 * Tells whether the given array is a list of split records rather than a
 * single record.
 *
 * A list has the consecutive integer keys 0..n-1; a single record is keyed by
 * attribute names. An empty array is not considered a list of split records.
 *
 * @param array $arr Value returned by a `*_to_record` filter.
 *
 * @return bool
 */
function is_split_record($arr) {
    if (!is_array($arr) || empty($arr)) {
        return false;
    }

    // Strict comparison: with `==`, a string key such as 'title' compares
    // equal to the integer 0 on PHP versions before 8.
    return array_values($arr) === $arr;
}