changeset 3817:8f26cc9bff26

significantly improve treesitter performance while editing large files (#4716) * significantly improve treesitter performance while editing large files * Apply stylistic suggestions from code review Co-authored-by: Michael Davis <[email protected]> * use PartialEq and Hash instead of a freestanding function Co-authored-by: Michael Davis <[email protected]>
author Pascal Kuthe <pascal.kuthe@semimod.de>
date Tue, 22 Nov 2022 03:54:22 +0100
parents 06fd2067302d
children 1ddf064dd53e
files Cargo.lock helix-core/Cargo.toml helix-core/src/syntax.rs
diffstat 3 files changed, 107 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/Cargo.lock	Mon Nov 21 20:52:23 2022 -0600
+++ b/Cargo.lock	Tue Nov 22 03:54:22 2022 +0100
@@ -14,6 +14,18 @@
 ]
 
 [[package]]
+name = "ahash"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107"
+dependencies = [
+ "cfg-if",
+ "getrandom",
+ "once_cell",
+ "version_check",
+]
+
+[[package]]
 name = "aho-corasick"
 version = "0.7.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -400,18 +412,29 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 dependencies = [
- "ahash",
+ "ahash 0.7.6",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ff8ae62cd3a9102e5637afc8452c55acf3844001bd5374e0b0bd7b6616c038"
+dependencies = [
+ "ahash 0.8.2",
 ]
 
 [[package]]
 name = "helix-core"
 version = "0.6.0"
 dependencies = [
+ "ahash 0.8.2",
  "arc-swap",
  "bitflags",
  "chrono",
  "encoding_rs",
  "etcetera",
+ "hashbrown 0.13.1",
  "helix-loader",
  "log",
  "once_cell",
@@ -1288,7 +1311,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c5faade31a542b8b35855fff6e8def199853b2da8da256da52f52f1316ee3137"
 dependencies = [
- "hashbrown",
+ "hashbrown 0.12.3",
  "regex",
 ]
 
--- a/helix-core/Cargo.toml	Mon Nov 21 20:52:23 2022 -0600
+++ b/helix-core/Cargo.toml	Tue Nov 22 03:54:22 2022 +0100
@@ -30,6 +30,8 @@
 arc-swap = "1"
 regex = "1"
 bitflags = "1.3"
+ahash = "0.8.2"
+hashbrown = { version = "0.13.1", features = ["raw"] }
 
 log = "0.4"
 serde = { version = "1.0", features = ["derive"] }
--- a/helix-core/src/syntax.rs	Mon Nov 21 20:52:23 2022 -0600
+++ b/helix-core/src/syntax.rs	Tue Nov 22 03:54:22 2022 +0100
@@ -7,8 +7,10 @@
     Rope, RopeSlice, Tendril,
 };
 
+use ahash::RandomState;
 use arc_swap::{ArcSwap, Guard};
 use bitflags::bitflags;
+use hashbrown::raw::RawTable;
 use slotmap::{DefaultKey as LayerId, HopSlotMap};
 
 use std::{
@@ -16,7 +18,8 @@
     cell::RefCell,
     collections::{HashMap, VecDeque},
     fmt,
-    mem::replace,
+    hash::{Hash, Hasher},
+    mem::{replace, transmute},
     path::Path,
     str::FromStr,
     sync::Arc,
@@ -770,30 +773,38 @@
         // Convert the changeset into tree sitter edits.
         let edits = generate_edits(old_source, changeset);
 
+        // This table allows inverse indexing of `layers`.
+        // That is by hashing a `Layer` you can find
+        // the `LayerId` of an existing equivalent `Layer` in `layers`.
+        //
+        // It is used to determine if a new layer exists for an injection
+        // or if an existing layer needs to be updated.
+        let mut layers_table = RawTable::with_capacity(self.layers.len());
+        let layers_hasher = RandomState::new();
         // Use the edits to update all layers markers
-        if !edits.is_empty() {
-            fn point_add(a: Point, b: Point) -> Point {
-                if b.row > 0 {
-                    Point::new(a.row.saturating_add(b.row), b.column)
-                } else {
-                    Point::new(0, a.column.saturating_add(b.column))
-                }
+        fn point_add(a: Point, b: Point) -> Point {
+            if b.row > 0 {
+                Point::new(a.row.saturating_add(b.row), b.column)
+            } else {
+                Point::new(0, a.column.saturating_add(b.column))
             }
-            fn point_sub(a: Point, b: Point) -> Point {
-                if a.row > b.row {
-                    Point::new(a.row.saturating_sub(b.row), a.column)
-                } else {
-                    Point::new(0, a.column.saturating_sub(b.column))
-                }
+        }
+        fn point_sub(a: Point, b: Point) -> Point {
+            if a.row > b.row {
+                Point::new(a.row.saturating_sub(b.row), a.column)
+            } else {
+                Point::new(0, a.column.saturating_sub(b.column))
+            }
+        }
+
+        for (layer_id, layer) in self.layers.iter_mut() {
+            // The root layer always covers the whole range (0..usize::MAX)
+            if layer.depth == 0 {
+                layer.flags = LayerUpdateFlags::MODIFIED;
+                continue;
             }
 
-            for layer in self.layers.values_mut() {
-                // The root layer always covers the whole range (0..usize::MAX)
-                if layer.depth == 0 {
-                    layer.flags = LayerUpdateFlags::MODIFIED;
-                    continue;
-                }
-
+            if !edits.is_empty() {
                 for range in &mut layer.ranges {
                     // Roughly based on https://github.com/tree-sitter/tree-sitter/blob/ddeaa0c7f534268b35b4f6cb39b52df082754413/lib/src/subtree.c#L691-L720
                     for edit in edits.iter().rev() {
@@ -858,6 +869,12 @@
                     }
                 }
             }
+
+            let hash = layers_hasher.hash_one(layer);
+            // Safety: insert_no_grow is unsafe because it assumes that the table
+            // has enough capacity to hold additional elements.
+            // This is always the case as we reserved enough capacity above.
+            unsafe { layers_table.insert_no_grow(hash, layer_id) };
         }
 
         PARSER.with(|ts_parser| {
@@ -982,27 +999,23 @@
                 let depth = layer.depth + 1;
                 // TODO: can't inline this since matches borrows self.layers
                 for (config, ranges) in injections {
-                    // Find an existing layer
-                    let layer = self
-                        .layers
-                        .iter_mut()
-                        .find(|(_, layer)| {
-                            layer.depth == depth && // TODO: track parent id instead
-                            layer.config.language == config.language && layer.ranges == ranges
+                    let new_layer = LanguageLayer {
+                        tree: None,
+                        config,
+                        depth,
+                        ranges,
+                        flags: LayerUpdateFlags::empty(),
+                    };
+
+                    // Find an identical existing layer
+                    let layer = layers_table
+                        .get(layers_hasher.hash_one(&new_layer), |&it| {
+                            self.layers[it] == new_layer
                         })
-                        .map(|(id, _layer)| id);
+                        .copied();
 
                     // ...or insert a new one.
-                    let layer_id = layer.unwrap_or_else(|| {
-                        self.layers.insert(LanguageLayer {
-                            tree: None,
-                            config,
-                            depth,
-                            ranges,
-                            // set the modified flag to ensure the layer is parsed
-                            flags: LayerUpdateFlags::empty(),
-                        })
-                    });
+                    let layer_id = layer.unwrap_or_else(|| self.layers.insert(new_layer));
 
                     queue.push_back(layer_id);
                 }
@@ -1139,6 +1152,34 @@
     flags: LayerUpdateFlags,
 }
 
+/// This PartialEq implementation only checks if that
+/// two layers are theoretically identical (meaning they highlight the same text range with the same language).
+/// It does not check whether the layers have the same internal treesitter
+/// state.
+impl PartialEq for LanguageLayer {
+    fn eq(&self, other: &Self) -> bool {
+        self.depth == other.depth
+            && self.config.language == other.config.language
+            && self.ranges == other.ranges
+    }
+}
+
+/// Hash implementation belongs to PartialEq implementation above.
+/// See its documentation for details.
+impl Hash for LanguageLayer {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.depth.hash(state);
+        // The transmute is necessary here because tree_sitter::Language does not derive Hash at the moment.
+        // However it does use #[repr] transparent so the transmute here is safe
+        // as `Language` (which `Grammar` is an alias for) is just a newtype wrapper around a (thin) pointer.
+        // This is also compatible with the PartialEq implementation of language
+        // as that is just a pointer comparison.
+        let language: *const () = unsafe { transmute(self.config.language) };
+        language.hash(state);
+        self.ranges.hash(state);
+    }
+}
+
 impl LanguageLayer {
     pub fn tree(&self) -> &Tree {
         // TODO: no unwrap