view helix-view/src/handlers/word_index.rs @ 6833:a1c38f138388 draft

(13206) word completion (grafted from 1efa919fb2b9841da497a9f29b492bf4389858f3)
author Michael Davis <mcarsondavis@gmail.com>
date Wed, 14 May 2025 10:23:43 -0400
parents
children
line wrap: on
line source

//! Indexing of words from open buffers.
//!
//! This provides an eventually consistent set of words used in any open buffers. This set is
//! later used for lexical completion.

use std::{borrow::Cow, collections::HashMap, iter, mem, sync::Arc, time::Duration};

use helix_core::{
    chars::char_is_word, fuzzy::fuzzy_match, movement, ChangeSet, Range, Rope, RopeSlice,
};
use helix_event::{register_hook, AsyncHook};
use helix_stdx::rope::RopeSliceExt as _;
use parking_lot::RwLock;
use tokio::{sync::mpsc, time::Instant};

use crate::{
    events::{DocumentDidChange, DocumentDidClose, DocumentDidOpen},
    DocumentId,
};

use super::Handlers;

#[derive(Debug)]
struct Change {
    old_text: Rope,
    text: Rope,
    changes: ChangeSet,
}

#[derive(Debug)]
enum Event {
    Insert(Rope),
    Update(DocumentId, Change),
    Delete(DocumentId, Rope),
}

#[derive(Debug)]
pub struct Handler {
    pub(super) index: WordIndex,
    /// A sender into an async hook which debounces updates to the index.
    hook: mpsc::Sender<Event>,
    /// A sender to a tokio task which coordinates the indexing of documents.
    ///
    /// See [WordIndex::run]. A supervisor-like task is in charge of spawning tasks to update the
    /// index. This ensures that consecutive edits to a document trigger the correct order of
    /// insertions and deletions into the word set.
    coordinator: mpsc::UnboundedSender<Event>,
}

impl Handler {
    pub fn spawn() -> Self {
        let index = WordIndex::default();
        let (tx, rx) = mpsc::unbounded_channel();
        tokio::spawn(index.clone().run(rx));
        Self {
            hook: Hook {
                changes: HashMap::default(),
                coordinator: tx.clone(),
            }
            .spawn(),
            index,
            coordinator: tx,
        }
    }
}

#[derive(Debug)]
struct Hook {
    changes: HashMap<DocumentId, Change>,
    coordinator: mpsc::UnboundedSender<Event>,
}

const DEBOUNCE: Duration = Duration::from_secs(1);

impl AsyncHook for Hook {
    type Event = Event;

    fn handle_event(&mut self, event: Self::Event, timeout: Option<Instant>) -> Option<Instant> {
        match event {
            Event::Insert(_) => unreachable!("inserts are sent to the worker directly"),
            Event::Update(doc, change) => {
                if let Some(pending_change) = self.changes.get_mut(&doc) {
                    // If there is already a change waiting for this document, merge the two
                    // changes together by composing the changesets and saving the new `text`.
                    pending_change.changes =
                        mem::take(&mut pending_change.changes).compose(change.changes);
                    pending_change.text = change.text;
                    Some(Instant::now() + DEBOUNCE)
                } else if !is_changeset_significant(&change.changes) {
                    // If the changeset is fairly large, debounce before updating the index.
                    self.changes.insert(doc, change);
                    Some(Instant::now() + DEBOUNCE)
                } else {
                    // Otherwise if the change is small, queue the update to the index immediately.
                    self.coordinator.send(Event::Update(doc, change)).unwrap();
                    timeout
                }
            }
            Event::Delete(doc, text) => {
                // If there are pending changes that haven't been indexed since the last debounce,
                // forget them and delete the old text.
                if let Some(change) = self.changes.remove(&doc) {
                    self.coordinator
                        .send(Event::Delete(doc, change.old_text))
                        .unwrap();
                } else {
                    self.coordinator.send(Event::Delete(doc, text)).unwrap();
                }
                timeout
            }
        }
    }

    fn finish_debounce(&mut self) {
        for (doc, change) in self.changes.drain() {
            self.coordinator.send(Event::Update(doc, change)).unwrap();
        }
    }
}

/// Minimum number of grapheme clusters required to include a word in the index
const MIN_WORD_GRAPHEMES: usize = 3;
/// Maximum word length allowed (in chars)
const MAX_WORD_LEN: usize = 50;

type Word = helix_stdx::str::TinyBoxedStr;

#[derive(Debug, Default)]
struct WordIndexInner {
    /// Reference counted storage for words.
    ///
    /// Words are very likely to be reused many times. Instead of storing duplicates we keep a
    /// reference count of times a word is used. When the reference count drops to zero the word
    /// is removed from the index.
    words: HashMap<Word, u32>,
}

impl WordIndexInner {
    fn words(&self) -> impl Iterator<Item = &Word> {
        self.words.keys()
    }

    fn insert(&mut self, word: RopeSlice) {
        assert!(word.len_chars() <= MAX_WORD_LEN);
        // The word must be shorter than `TinyBoxedStr::MAX` because it is fewer than 50
        // characters and characters take at most four bytes.
        assert!(word.len_bytes() < Word::MAX_LEN);

        let word: Cow<str> = word.into();
        if let Some(rc) = self.words.get_mut(word.as_ref()) {
            *rc = rc.saturating_add(1);
        } else {
            self.words.insert(word.try_into().unwrap(), 1);
        }
    }

    fn remove(&mut self, word: RopeSlice) {
        let word: Cow<str> = word.into();
        match self.words.get_mut(word.as_ref()) {
            Some(1) => {
                self.words.remove(word.as_ref());
            }
            Some(n) => *n -= 1,
            None => (),
        }
    }
}

#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    inner: Arc<RwLock<WordIndexInner>>,
}

impl WordIndex {
    pub fn matches(&self, pattern: &str) -> Vec<String> {
        let inner = self.inner.read();
        let mut matches = fuzzy_match(pattern, inner.words(), false);
        matches.sort_unstable_by_key(|(_, score)| *score);
        matches
            .into_iter()
            .map(|(word, _)| word.to_string())
            .collect()
    }

    fn add_document(&self, text: &Rope) {
        let words: Vec<_> = words(text.slice(..)).collect();
        let mut inner = self.inner.write();
        for word in words {
            inner.insert(word);
        }
    }

    fn update_document(&self, old_text: &Rope, text: &Rope, changes: &ChangeSet) {
        let mut inserted = Vec::new();
        let mut removed = Vec::new();
        for (old_window, new_window) in changed_windows(old_text.slice(..), text.slice(..), changes)
        {
            inserted.extend(words(new_window));
            removed.extend(words(old_window));
        }

        let mut inner = self.inner.write();
        for word in inserted {
            inner.insert(word);
        }
        for word in removed {
            inner.remove(word);
        }
    }

    fn remove_document(&self, text: &Rope) {
        let words: Vec<_> = words(text.slice(..)).collect();
        let mut inner = self.inner.write();
        for word in words {
            inner.remove(word);
        }
    }

    /// Coordinate the indexing of documents.
    ///
    /// This task wraps a MPSC queue and spawns blocking tasks which update the index. Updates
    /// are applied one-by-one to ensure that changes to the index are **serialized**:
    /// updates to each document must be applied in-order.
    async fn run(self, mut events: mpsc::UnboundedReceiver<Event>) {
        while let Some(event) = events.recv().await {
            let this = self.clone();
            tokio::task::spawn_blocking(move || match event {
                Event::Insert(text) => {
                    this.add_document(&text);
                }
                Event::Update(
                    _doc,
                    Change {
                        old_text,
                        text,
                        changes,
                        ..
                    },
                ) => {
                    this.update_document(&old_text, &text, &changes);
                }
                Event::Delete(_doc, text) => {
                    this.remove_document(&text);
                }
            })
            .await
            .unwrap();
        }
    }
}

fn words(text: RopeSlice) -> impl Iterator<Item = RopeSlice> {
    let mut cursor = Range::point(0);
    if text
        .get_char(cursor.anchor)
        .is_some_and(|ch| !ch.is_whitespace())
    {
        let cursor_word_end = movement::move_next_word_end(text, cursor, 1);
        if cursor_word_end.anchor == 0 {
            cursor = cursor_word_end;
        }
    }

    iter::from_fn(move || {
        while cursor.head <= text.len_chars() {
            let mut word = None;
            if text
                .slice(..cursor.head)
                .graphemes_rev()
                .take(MIN_WORD_GRAPHEMES)
                .take_while(|g| g.chars().all(char_is_word))
                .count()
                == MIN_WORD_GRAPHEMES
            {
                cursor.anchor += text
                    .chars_at(cursor.anchor)
                    .take_while(|&c| !char_is_word(c))
                    .count();
                let slice = cursor.slice(text);
                if slice.len_chars() <= MAX_WORD_LEN {
                    word = Some(slice);
                }
            }
            let head = cursor.head;
            cursor = movement::move_next_word_end(text, cursor, 1);
            if cursor.head == head {
                cursor.head = usize::MAX;
            }
            if word.is_some() {
                return word;
            }
        }
        None
    })
}

/// Finds areas of the old and new texts around each operation in `changes`.
///
/// The window is larger than the changed area and can encompass multiple insert/delete operations
/// if they are grouped closely together.
///
/// The ranges of the old and new text should usually be of different sizes. For example a
/// deletion of "foo" surrounded by large retain sections would give a longer window into the
/// `old_text` and shorter window of `new_text`. Vice-versa for an insertion. A full replacement
/// of a word though would give two slices of the same size.
fn changed_windows<'a>(
    old_text: RopeSlice<'a>,
    new_text: RopeSlice<'a>,
    changes: &'a ChangeSet,
) -> impl Iterator<Item = (RopeSlice<'a>, RopeSlice<'a>)> {
    use helix_core::Operation::*;

    let mut operations = changes.changes().iter().peekable();
    let mut old_pos = 0;
    let mut new_pos = 0;
    iter::from_fn(move || loop {
        let operation = operations.next()?;
        let old_start = old_pos;
        let new_start = new_pos;
        let len = operation.len();
        match operation {
            Retain(_) => {
                old_pos += len;
                new_pos += len;
                continue;
            }
            Insert(_) => new_pos += len,
            Delete(_) => old_pos += len,
        }

        // Scan ahead until a `Retain` is found which would end a window.
        while let Some(o) = operations.next_if(|op| !matches!(op, Retain(n) if *n > MAX_WORD_LEN)) {
            let len = o.len();
            match o {
                Retain(_) => {
                    old_pos += len;
                    new_pos += len;
                }
                Delete(_) => old_pos += len,
                Insert(_) => new_pos += len,
            }
        }

        let old_window = old_start.saturating_sub(MAX_WORD_LEN)
            ..(old_pos + MAX_WORD_LEN).min(old_text.len_chars());
        let new_window = new_start.saturating_sub(MAX_WORD_LEN)
            ..(new_pos + MAX_WORD_LEN).min(new_text.len_chars());

        return Some((old_text.slice(old_window), new_text.slice(new_window)));
    })
}

/// Estimates whether a changeset is significant or small.
fn is_changeset_significant(changes: &ChangeSet) -> bool {
    use helix_core::Operation::*;

    let mut diff = 0;
    for operation in changes.changes() {
        match operation {
            Retain(_) => continue,
            Delete(_) | Insert(_) => diff += operation.len(),
        }
    }

    // This is arbitrary and could be tuned further:
    diff > 1_000
}

pub(crate) fn register_hooks(handlers: &Handlers) {
    let coordinator = handlers.word_index.coordinator.clone();
    register_hook!(move |event: &mut DocumentDidOpen<'_>| {
        let doc = doc!(event.editor, &event.doc);
        if doc.word_completion_enabled() {
            coordinator.send(Event::Insert(doc.text().clone())).unwrap();
        }
        Ok(())
    });

    let tx = handlers.word_index.hook.clone();
    register_hook!(move |event: &mut DocumentDidChange<'_>| {
        if !event.ghost_transaction && event.doc.word_completion_enabled() {
            helix_event::send_blocking(
                &tx,
                Event::Update(
                    event.doc.id(),
                    Change {
                        old_text: event.old_text.clone(),
                        text: event.doc.text().clone(),
                        changes: event.changes.clone(),
                    },
                ),
            );
        }
        Ok(())
    });

    let tx = handlers.word_index.hook.clone();
    register_hook!(move |event: &mut DocumentDidClose<'_>| {
        if event.doc.word_completion_enabled() {
            helix_event::send_blocking(
                &tx,
                Event::Delete(event.doc.id(), event.doc.text().clone()),
            );
        }
        Ok(())
    });
}

#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use helix_core::diff::compare_ropes;

    impl WordIndex {
        fn words(&self) -> HashSet<String> {
            let inner = self.inner.read();
            inner.words().map(|w| w.to_string()).collect()
        }
    }

    #[track_caller]
    fn assert_words<I: ToString, T: IntoIterator<Item = I>>(text: &str, expected: T) {
        let text = Rope::from_str(text);
        let index = WordIndex::default();
        index.add_document(&text);
        let actual = index.words();
        let expected: HashSet<_> = expected.into_iter().map(|i| i.to_string()).collect();
        assert_eq!(expected, actual);
    }

    #[test]
    fn parse() {
        assert_words("one two three", ["one", "two", "three"]);
        assert_words("a foo c", ["foo"]);
    }

    #[track_caller]
    fn assert_diff<S, R, I>(before: &str, after: &str, expect_removed: R, expect_inserted: I)
    where
        S: ToString,
        R: IntoIterator<Item = S>,
        I: IntoIterator<Item = S>,
    {
        let before = Rope::from_str(before);
        let after = Rope::from_str(after);
        let diff = compare_ropes(&before, &after);
        let expect_removed: HashSet<_> =
            expect_removed.into_iter().map(|i| i.to_string()).collect();
        let expect_inserted: HashSet<_> =
            expect_inserted.into_iter().map(|i| i.to_string()).collect();

        let index = WordIndex::default();
        index.add_document(&before);
        let words_before = index.words();
        index.update_document(&before, &after, diff.changes());
        let words_after = index.words();

        let actual_removed = words_before.difference(&words_after).cloned().collect();
        let actual_inserted = words_after.difference(&words_before).cloned().collect();

        eprintln!("\"{before}\" {words_before:?} => \"{after}\" {words_after:?}");
        assert_eq!(
            expect_removed, actual_removed,
            "expected {expect_removed:?} to be removed, instead {actual_removed:?} was"
        );
        assert_eq!(
            expect_inserted, actual_inserted,
            "expected {expect_inserted:?} to be inserted, instead {actual_inserted:?} was"
        );
    }

    #[test]
    fn diff() {
        assert_diff("one two three", "one five three", ["two"], ["five"]);
        assert_diff("one two three", "one to three", ["two"], []);
        assert_diff("one two three", "one three", ["two"], []);
        assert_diff("one two three", "one t{o three", ["two"], []);
        assert_diff("one foo three", "one fooo three", ["foo"], ["fooo"]);

        // TODO: further testing. Consider setting the max word size smaller in tests.
    }
}