This is page 4 of 10. Use http://codebase.md/moisnx/arc?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .clang-format ├── .config │ └── arceditor │ ├── config.yaml │ ├── keybinds.conf │ └── themes │ ├── catppuccin-mocha.theme │ ├── cyberpunk-neon.theme │ ├── default.theme │ ├── dracula.theme │ ├── github_dark.theme │ ├── gruvbox_dark.theme │ ├── gruvbox_light.theme │ ├── high_constrast_dark.theme │ ├── monokai.theme │ ├── onedark.theme │ ├── solarized_dark.theme │ ├── solarized_light.theme │ ├── tokyo_night.theme │ └── vscode_light.theme ├── .github │ └── assets │ └── screenshot.gif ├── .gitignore ├── .gitmessage ├── .gitmodules ├── build.md ├── CMakeLists.txt ├── deps │ └── tree-sitter-markdown │ ├── .editorconfig │ ├── .gitattributes │ ├── .github │ │ ├── screenshot.png │ │ └── workflows │ │ ├── ci.yml │ │ ├── publish.yml │ │ └── release.yml │ ├── .gitignore │ ├── binding.gyp │ ├── bindings │ │ ├── go │ │ │ ├── binding_test.go │ │ │ ├── markdown_inline.go │ │ │ └── markdown.go │ │ ├── node │ │ │ ├── binding_test.js │ │ │ ├── binding.cc │ │ │ ├── index.d.ts │ │ │ ├── index.js │ │ │ └── inline.js │ │ ├── python │ │ │ ├── tests │ │ │ │ └── test_binding.py │ │ │ └── tree_sitter_markdown │ │ │ ├── __init__.py │ │ │ ├── __init__.pyi │ │ │ ├── binding.c │ │ │ └── py.typed │ │ ├── rust │ │ │ ├── benchmark.rs │ │ │ ├── build.rs │ │ │ ├── lib.rs │ │ │ └── parser.rs │ │ └── swift │ │ ├── .gitignore │ │ └── TreeSitterMarkdownTests │ │ └── TreeSitterMarkdownTests.swift │ ├── Cargo.toml │ ├── CMakeLists.txt │ ├── common │ │ ├── common.js │ │ ├── common.mak │ │ └── html_entities.json │ ├── CONTRIBUTING.md │ ├── go.mod │ ├── LICENSE │ ├── Makefile │ ├── package-lock.json │ ├── package.json │ ├── Package.resolved │ ├── Package.swift │ ├── pyproject.toml │ ├── README.md │ ├── scripts │ │ ├── build.js │ │ └── test.js │ ├── setup.py │ ├── tree-sitter-markdown │ │ ├── bindings │ │ │ ├── c │ │ │ │ ├── tree-sitter-markdown.h │ │ │ │ └── 
tree-sitter-markdown.pc.in │ │ │ └── swift │ │ │ └── TreeSitterMarkdown │ │ │ └── markdown.h │ │ ├── CMakeLists.txt │ │ ├── grammar.js │ │ ├── Makefile │ │ ├── package.json │ │ ├── queries │ │ │ ├── highlights.scm │ │ │ └── injections.scm │ │ ├── src │ │ │ ├── grammar.json │ │ │ ├── node-types.json │ │ │ ├── parser.c │ │ │ ├── scanner.c │ │ │ └── tree_sitter │ │ │ ├── alloc.h │ │ │ ├── array.h │ │ │ └── parser.h │ │ └── test │ │ └── corpus │ │ ├── extension_minus_metadata.txt │ │ ├── extension_pipe_table.txt │ │ ├── extension_plus_metadata.txt │ │ ├── extension_task_list.txt │ │ ├── failing.txt │ │ ├── issues.txt │ │ └── spec.txt │ ├── tree-sitter-markdown-inline │ │ ├── bindings │ │ │ ├── c │ │ │ │ ├── tree-sitter-markdown-inline.h │ │ │ │ └── tree-sitter-markdown-inline.pc.in │ │ │ └── swift │ │ │ └── TreeSitterMarkdownInline │ │ │ └── markdown_inline.h │ │ ├── CMakeLists.txt │ │ ├── grammar.js │ │ ├── Makefile │ │ ├── package.json │ │ ├── queries │ │ │ ├── highlights.scm │ │ │ └── injections.scm │ │ ├── src │ │ │ ├── grammar.json │ │ │ ├── node-types.json │ │ │ ├── parser.c │ │ │ ├── scanner.c │ │ │ └── tree_sitter │ │ │ ├── alloc.h │ │ │ ├── array.h │ │ │ └── parser.h │ │ └── test │ │ └── corpus │ │ ├── extension_latex.txt │ │ ├── extension_strikethrough.txt │ │ ├── extension_wikilink.txt │ │ ├── failing.txt │ │ ├── issues.txt │ │ ├── spec.txt │ │ └── tags.txt │ └── tree-sitter.json ├── LICENSE ├── Makefile ├── quickstart.md ├── README.md ├── src │ ├── core │ │ ├── buffer.cpp │ │ ├── buffer.h │ │ ├── config_manager.cpp │ │ ├── config_manager.h │ │ ├── editor_delta.h │ │ ├── editor_validation.h │ │ ├── editor.cpp │ │ └── editor.h │ ├── features │ │ ├── markdown_state.h │ │ ├── syntax_config_loader.cpp │ │ ├── syntax_config_loader.h │ │ ├── syntax_highlighter.cpp │ │ └── syntax_highlighter.h │ ├── main.cpp │ └── ui │ ├── input_handler.cpp │ ├── input_handler.h │ ├── renderer.cpp │ ├── renderer.h │ ├── style_manager.cpp │ └── style_manager.h └── treesitter ├── 
languages.yaml └── queries ├── _javascript │ ├── highlights.scm │ ├── locals.scm │ └── tags.scm ├── _jsx │ ├── highlights.scm │ ├── indents.scm │ └── textobjects.scm ├── _typescript │ ├── highlights.scm │ ├── indents.scm │ ├── locals.scm │ ├── tags.scm │ └── textobjects.scm ├── bash │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── c │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── cpp │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── css │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ └── rainbows.scm ├── ecma │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── README.md │ └── textobjects.scm ├── go │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── javascript │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── markdown │ ├── highlights.scm │ ├── injections.scm │ └── tags.scm ├── markdown.inline │ ├── highlights.scm │ └── injections.scm ├── python │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── rust │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── toml │ ├── highlights.scm │ ├── injections.scm │ ├── rainbows.scm │ └── textobjects.scm ├── tsx │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── typescript │ ├── highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── locals.scm │ ├── rainbows.scm │ ├── tags.scm │ └── textobjects.scm ├── yaml │ ├── 
highlights.scm │ ├── indents.scm │ ├── injections.scm │ ├── rainbows.scm │ └── textobjects.scm └── zig ├── highlights.scm ├── indents.scm ├── injections.scm └── textobjects.scm ``` # Files -------------------------------------------------------------------------------- /src/features/syntax_highlighter.cpp: -------------------------------------------------------------------------------- ```cpp 1 | #include "syntax_highlighter.h" 2 | #include "src/core/config_manager.h" 3 | #include <algorithm> 4 | #include <cstring> 5 | #include <fstream> 6 | #include <sstream> 7 | #ifdef _WIN32 8 | #include <curses.h> 9 | #else 10 | #include <ncurses.h> 11 | #endif 12 | #include <iostream> 13 | 14 | #ifdef TREE_SITTER_ENABLED 15 | #include "language_registry.h" // Auto-generated by CMake 16 | #include "tree_sitter/api.h" 17 | #endif 18 | 19 | SyntaxHighlighter::SyntaxHighlighter() 20 | : config_loader_(std::make_unique<SyntaxConfigLoader>()), 21 | current_language_config_(nullptr), currentLanguage("text") 22 | #ifdef TREE_SITTER_ENABLED 23 | , 24 | parser_(nullptr), tree_(nullptr), current_ts_language_(nullptr), 25 | current_ts_query_(nullptr) 26 | #endif 27 | { 28 | #ifdef TREE_SITTER_ENABLED 29 | initializeTreeSitter(); 30 | #endif 31 | } 32 | 33 | SyntaxHighlighter::~SyntaxHighlighter() 34 | { 35 | #ifdef TREE_SITTER_ENABLED 36 | cleanupTreeSitter(); 37 | #endif 38 | } 39 | 40 | bool SyntaxHighlighter::initialize(const std::string &config_directory) 41 | { 42 | // std::cerr << "=== SyntaxHighlighter::initialize ===\n"; 43 | // std::cerr << "Config directory: " << config_directory << std::endl; 44 | 45 | if (!config_loader_->loadAllLanguageConfigs(config_directory)) 46 | { 47 | std::cerr << "Failed to load language configurations from: " 48 | << config_directory << std::endl; 49 | // Fall back to basic highlighting rules 50 | loadBasicRules(); 51 | return false; 52 | } 53 | ConfigManager::registerReloadCallback( 54 | [this, config_directory]() 55 | { 56 | std::cerr << "Syntax 
config reload triggered." << std::endl; 57 | // Clear old configs and reload them 58 | config_loader_->language_configs_.clear(); 59 | config_loader_->extension_to_language_.clear(); 60 | 61 | // Reload all config files from the directory 62 | this->config_loader_->loadAllLanguageConfigs( 63 | ConfigManager::getSyntaxRulesDir()); 64 | 65 | // Re-apply the parser for the current file 66 | setLanguage(this->currentLanguage); // Re-set language to pick up new 67 | // rules/queries 68 | // NOTE: Force a full buffer re-highlight/re-parse (e.g., set a flag) 69 | }); 70 | 71 | // std::cout << "Successfully loaded language configurations" << std::endl; 72 | return true; 73 | } 74 | 75 | #ifdef TREE_SITTER_ENABLED 76 | 77 | void SyntaxHighlighter::diagnoseGrammar() const 78 | { 79 | if (!current_ts_language_) 80 | { 81 | std::cerr << "ERROR: No language loaded" << std::endl; 82 | return; 83 | } 84 | 85 | std::cerr << "=== Grammar Diagnostic ===" << std::endl; 86 | std::cerr << "ABI Version: " << ts_language_abi_version(current_ts_language_) 87 | << std::endl; 88 | std::cerr << "Symbol count: " 89 | << ts_language_symbol_count(current_ts_language_) << std::endl; 90 | 91 | // Test a simple parse 92 | const char *test_code = "int x;"; 93 | TSTree *test_tree = ts_parser_parse_string(parser_, nullptr, test_code, 94 | std::strlen(test_code)); 95 | 96 | if (test_tree) 97 | { 98 | TSNode root = ts_tree_root_node(test_tree); 99 | char *tree_string = ts_node_string(root); 100 | std::cerr << "Parse test result: " << tree_string << std::endl; 101 | free(tree_string); 102 | ts_tree_delete(test_tree); 103 | } 104 | else 105 | { 106 | std::cerr << "ERROR: Failed to parse simple test code" << std::endl; 107 | } 108 | 109 | std::cerr << "=== End Diagnostic ===" << std::endl; 110 | } 111 | 112 | #endif 113 | 114 | void SyntaxHighlighter::setLanguage(const std::string &extension) 115 | { 116 | std::string language_name = 117 | config_loader_->getLanguageFromExtension(extension); 118 | 119 | 
const LanguageConfig *config = 120 | config_loader_->getLanguageConfig(language_name); 121 | 122 | if (config) 123 | { 124 | current_language_config_ = config; 125 | currentLanguage = language_name; 126 | 127 | #ifdef TREE_SITTER_ENABLED 128 | if (!config->parser_name.empty() && parser_) 129 | { 130 | const TSLanguage *ts_language = getLanguageFunction(config->parser_name); 131 | if (ts_language) 132 | { 133 | if (!ts_parser_set_language(parser_, ts_language)) 134 | { 135 | std::cerr << "ERROR: Failed to set language for parser" << std::endl; 136 | loadBasicRules(); 137 | return; 138 | } 139 | current_ts_language_ = ts_language; 140 | 141 | // Clean up old query 142 | if (current_ts_query_) 143 | { 144 | ts_query_delete(current_ts_query_); 145 | current_ts_query_ = nullptr; 146 | } 147 | 148 | // Load and merge all queries 149 | if (!config->queries.empty()) 150 | { 151 | std::string merged_query_source; 152 | 153 | for (const auto &query_path : config->queries) 154 | { 155 | std::ifstream file(query_path); 156 | if (!file.is_open()) 157 | { 158 | std::cerr << "ERROR: Cannot open query file: " << query_path 159 | << std::endl; 160 | continue; 161 | } 162 | 163 | std::stringstream buffer; 164 | buffer << file.rdbuf(); 165 | std::string query_content = buffer.str(); 166 | 167 | if (!query_content.empty()) 168 | { 169 | // Add newline between queries for safety 170 | if (!merged_query_source.empty()) 171 | { 172 | merged_query_source += "\n"; 173 | } 174 | merged_query_source += query_content; 175 | } 176 | } 177 | 178 | // Parse the merged query once 179 | if (!merged_query_source.empty()) 180 | { 181 | uint32_t error_offset; 182 | TSQueryError error_type; 183 | current_ts_query_ = ts_query_new( 184 | current_ts_language_, merged_query_source.c_str(), 185 | merged_query_source.length(), &error_offset, &error_type); 186 | 187 | if (!current_ts_query_) 188 | { 189 | std::cerr << "ERROR: Failed to parse merged query" << std::endl; 190 | std::cerr << " Error offset: " << 
error_offset << std::endl; 191 | std::cerr << " Error type: " << error_type << std::endl; 192 | 193 | // Show context around error 194 | if (error_offset < merged_query_source.length()) 195 | { 196 | int context_start = std::max(0, (int)error_offset - 50); 197 | int context_end = std::min((int)merged_query_source.length(), 198 | (int)error_offset + 50); 199 | 200 | std::cerr << "Context around error:" << std::endl; 201 | std::cerr << "..." 202 | << merged_query_source.substr( 203 | context_start, context_end - context_start) 204 | << "..." << std::endl; 205 | std::cerr << std::string(error_offset - context_start + 3, ' ') 206 | << "^" << std::endl; 207 | } 208 | } 209 | } 210 | } 211 | } 212 | else 213 | { 214 | std::cerr << "ERROR: No Tree-sitter language function found for: " 215 | << config->parser_name << std::endl; 216 | loadBasicRules(); 217 | } 218 | } 219 | else 220 | { 221 | std::cerr << "Tree-sitter not available or no parser specified, using " 222 | "basic highlighting" 223 | << std::endl; 224 | loadBasicRules(); 225 | } 226 | #else 227 | std::cerr << "Tree-sitter disabled, using basic highlighting" << std::endl; 228 | loadBasicRules(); 229 | #endif 230 | } 231 | else 232 | { 233 | std::cerr << "ERROR: No config found for language: " << language_name 234 | << std::endl; 235 | loadBasicRules(); 236 | currentLanguage = "text"; 237 | current_language_config_ = nullptr; 238 | } 239 | } 240 | 241 | std::vector<ColorSpan> 242 | SyntaxHighlighter::getHighlightSpans(const std::string &line, int lineIndex, 243 | const GapBuffer &buffer) const 244 | { 245 | // Check cache first 246 | auto cache_it = line_cache_.find(lineIndex); 247 | if (cache_it != line_cache_.end()) 248 | { 249 | return cache_it->second; 250 | } 251 | 252 | // Handle Markdown special states 253 | if (currentLanguage == "Markdown" && line_states_.count(lineIndex)) 254 | { 255 | MarkdownState state = line_states_.at(lineIndex); 256 | if (state == MarkdownState::IN_FENCED_CODE_BLOCK) 257 | { 258 | 
std::vector<ColorSpan> result = { 259 | {0, (int)line.length(), getColorPairValue("MARKDOWN_CODE_BLOCK"), 260 | A_NORMAL, 100}}; 261 | line_cache_[lineIndex] = result; 262 | return result; 263 | } 264 | else if (state == MarkdownState::IN_BLOCKQUOTE) 265 | { 266 | std::vector<ColorSpan> result = { 267 | {0, (int)line.length(), getColorPairValue("MARKDOWN_BLOCKQUOTE"), 268 | A_NORMAL, 90}}; 269 | line_cache_[lineIndex] = result; 270 | return result; 271 | } 272 | } 273 | 274 | std::vector<ColorSpan> result; 275 | 276 | #ifdef TREE_SITTER_ENABLED 277 | // CRITICAL: Do lazy reparse if needed 278 | if (tree_needs_reparse_) 279 | { 280 | const_cast<SyntaxHighlighter *>(this)->updateTree(buffer); 281 | const_cast<SyntaxHighlighter *>(this)->tree_needs_reparse_ = false; 282 | } 283 | 284 | if (current_ts_query_ && tree_) 285 | { 286 | try 287 | { 288 | result = executeTreeSitterQuery(line, lineIndex); 289 | } 290 | catch (const std::exception &e) 291 | { 292 | std::cerr << "Tree-sitter query error on line " << lineIndex << ": " 293 | << e.what() << std::endl; 294 | result = getBasicHighlightSpans(line); 295 | } 296 | } 297 | #endif 298 | 299 | // Fall back to basic highlighting if no Tree-sitter result 300 | if (result.empty()) 301 | { 302 | result = getBasicHighlightSpans(line); 303 | } 304 | 305 | // Cache the result 306 | line_cache_[lineIndex] = result; 307 | return result; 308 | } 309 | void SyntaxHighlighter::updateTreeAfterEdit( 310 | const GapBuffer &buffer, size_t byte_pos, size_t old_byte_len, 311 | size_t new_byte_len, uint32_t start_row, uint32_t start_col, 312 | uint32_t old_end_row, uint32_t old_end_col, uint32_t new_end_row, 313 | uint32_t new_end_col) 314 | { 315 | #ifdef TREE_SITTER_ENABLED 316 | if (!tree_ || !parser_) 317 | return; 318 | 319 | // Apply incremental edit to tree structure 320 | TSInputEdit edit = {.start_byte = (uint32_t)byte_pos, 321 | .old_end_byte = (uint32_t)(byte_pos + old_byte_len), 322 | .new_end_byte = (uint32_t)(byte_pos + 
new_byte_len), 323 | .start_point = {start_row, start_col}, 324 | .old_end_point = {old_end_row, old_end_col}, 325 | .new_end_point = {new_end_row, new_end_col}}; 326 | 327 | ts_tree_edit(tree_, &edit); 328 | tree_version_++; 329 | 330 | // Mark that tree needs reparsing (will happen on next query) 331 | tree_needs_reparse_ = true; 332 | 333 | // For very large changes, schedule background reparse 334 | if (old_end_row != new_end_row && (new_end_row - old_end_row) > 10) 335 | { 336 | scheduleBackgroundParse(buffer); 337 | } 338 | #endif 339 | } 340 | 341 | void SyntaxHighlighter::invalidateLineCache(int lineNum) 342 | { 343 | line_cache_.erase(lineNum); 344 | } 345 | 346 | void SyntaxHighlighter::bufferChanged(const GapBuffer &buffer) 347 | { 348 | #ifdef TREE_SITTER_ENABLED 349 | if (!parser_ || !current_ts_language_) 350 | return; 351 | 352 | // REMOVED the "optimization" that was skipping reparsing 353 | // If current_buffer_content_ is empty, we MUST reparse 354 | 355 | if (current_buffer_content_.empty()) 356 | { 357 | // Content was cleared - this signals we need full reparse 358 | updateTree(buffer); 359 | } 360 | else if (!tree_) 361 | { 362 | // No tree exists - need initial parse 363 | updateTree(buffer); 364 | } 365 | // If tree exists AND content is valid, incremental edits should have 366 | // already updated it via notifyEdit() 367 | #endif 368 | 369 | if (currentLanguage == "Markdown") 370 | { 371 | updateMarkdownState(buffer); 372 | } 373 | } 374 | 375 | void SyntaxHighlighter::invalidateFromLine(int startLine) 376 | { 377 | // This is for structural changes (insert/delete lines) 378 | // Clear only lines >= startLine, but do it efficiently 379 | 380 | auto it = line_cache_.lower_bound(startLine); 381 | if (it != line_cache_.end()) 382 | { 383 | line_cache_.erase(it, line_cache_.end()); 384 | } 385 | 386 | // Don't clear content cache unless change is massive 387 | // Let incremental edits handle the tree updates 388 | } 389 | 390 | #ifdef 
TREE_SITTER_ENABLED 391 | bool SyntaxHighlighter::initializeTreeSitter() 392 | { 393 | parser_ = ts_parser_new(); 394 | if (!parser_) 395 | { 396 | std::cerr << "ERROR: Failed to create Tree-sitter parser" << std::endl; 397 | return false; 398 | } 399 | 400 | // Auto-register all languages from generated header 401 | registerAllLanguages(language_registry_); 402 | 403 | // std::cerr << "Tree-sitter initialized with " << language_registry_.size() 404 | // << " language parser(s)" << std::endl; 405 | 406 | return true; 407 | } 408 | 409 | void SyntaxHighlighter::cleanupTreeSitter() 410 | { 411 | // Wait for background thread 412 | while (is_parsing_) 413 | { 414 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 415 | } 416 | 417 | std::lock_guard<std::mutex> lock(tree_mutex_); // ADD LOCK 418 | 419 | if (current_ts_query_) 420 | { 421 | ts_query_delete(current_ts_query_); 422 | current_ts_query_ = nullptr; 423 | } 424 | 425 | if (tree_) 426 | { 427 | ts_tree_delete(tree_); 428 | tree_ = nullptr; 429 | } 430 | 431 | if (parser_) 432 | { 433 | ts_parser_delete(parser_); 434 | parser_ = nullptr; 435 | } 436 | } 437 | 438 | const TSLanguage * 439 | SyntaxHighlighter::getLanguageFunction(const std::string &parser_name) 440 | { 441 | auto it = language_registry_.find(parser_name); 442 | if (it != language_registry_.end()) 443 | { 444 | return it->second(); // Call the function pointer 445 | } 446 | 447 | // Enhanced error message showing available languages 448 | std::cerr << "WARNING: No Tree-sitter language found for: '" << parser_name 449 | << "'" << std::endl; 450 | std::cerr << " Available languages: "; 451 | bool first = true; 452 | for (const auto &pair : language_registry_) 453 | { 454 | if (!first) 455 | std::cerr << ", "; 456 | std::cerr << pair.first; 457 | first = false; 458 | } 459 | std::cerr << std::endl; 460 | 461 | return nullptr; 462 | } 463 | 464 | TSQuery * 465 | SyntaxHighlighter::loadQueryFromFile(const std::string &query_file_path) 466 | 
{ 467 | std::ifstream file(query_file_path); 468 | if (!file.is_open()) 469 | { 470 | std::cerr << "ERROR: Cannot open query file: " << query_file_path 471 | << std::endl; 472 | return nullptr; 473 | } 474 | 475 | std::stringstream buffer; 476 | buffer << file.rdbuf(); 477 | std::string query_source = buffer.str(); 478 | 479 | if (query_source.empty()) 480 | { 481 | std::cerr << "ERROR: Query file is empty: " << query_file_path << std::endl; 482 | return nullptr; 483 | } 484 | 485 | // Debug: Print the query source around the error offset 486 | // std::cerr << "Query source length: " << query_source.length() << " 487 | // characters" 488 | // << std::endl; 489 | 490 | uint32_t error_offset; 491 | TSQueryError error_type; 492 | TSQuery *query = 493 | ts_query_new(current_ts_language_, query_source.c_str(), 494 | query_source.length(), &error_offset, &error_type); 495 | 496 | if (!query) 497 | { 498 | std::cerr << "ERROR: Failed to parse query file " << query_file_path 499 | << std::endl; 500 | std::cerr << " Error offset: " << error_offset << std::endl; 501 | std::cerr << " Error type: " << error_type; 502 | 503 | // Provide more detailed error information 504 | switch (error_type) 505 | { 506 | case TSQueryErrorNone: 507 | std::cerr << " (None)"; 508 | break; 509 | case TSQueryErrorSyntax: 510 | std::cerr << " (Syntax Error)"; 511 | break; 512 | case TSQueryErrorNodeType: 513 | std::cerr << " (Unknown Node Type)"; 514 | break; 515 | case TSQueryErrorField: 516 | std::cerr << " (Unknown Field)"; 517 | break; 518 | case TSQueryErrorCapture: 519 | std::cerr << " (Unknown Capture)"; 520 | break; 521 | case TSQueryErrorStructure: 522 | std::cerr << " (Invalid Structure)"; 523 | break; 524 | default: 525 | std::cerr << " (Unknown Error)"; 526 | break; 527 | } 528 | std::cerr << std::endl; 529 | 530 | // Show context around error 531 | if (error_offset < query_source.length()) 532 | { 533 | int context_start = std::max(0, (int)error_offset - 50); 534 | int context_end = 
535 | std::min((int)query_source.length(), (int)error_offset + 50); 536 | 537 | std::cerr << "Context around error:" << std::endl; 538 | std::cerr << "..." 539 | << query_source.substr(context_start, 540 | context_end - context_start) 541 | << "..." << std::endl; 542 | 543 | // Point to error location 544 | std::cerr << std::string(error_offset - context_start + 3, ' ') << "^" 545 | << std::endl; 546 | } 547 | 548 | return nullptr; 549 | } 550 | 551 | // std::cerr << "Successfully loaded query from: " << query_file_path 552 | // << std::endl; 553 | return query; 554 | } 555 | 556 | void SyntaxHighlighter::notifyEdit(size_t byte_pos, size_t old_byte_len, 557 | size_t new_byte_len, uint32_t start_row, 558 | uint32_t start_col, uint32_t old_end_row, 559 | uint32_t old_end_col, uint32_t new_end_row, 560 | uint32_t new_end_col) 561 | { 562 | #ifdef TREE_SITTER_ENABLED 563 | if (!tree_) 564 | { 565 | return; 566 | } 567 | 568 | TSInputEdit edit = {.start_byte = (uint32_t)byte_pos, 569 | .old_end_byte = (uint32_t)(byte_pos + old_byte_len), 570 | .new_end_byte = (uint32_t)(byte_pos + new_byte_len), 571 | .start_point = {start_row, start_col}, 572 | .old_end_point = {old_end_row, old_end_col}, 573 | .new_end_point = {new_end_row, new_end_col}}; 574 | 575 | ts_tree_edit(tree_, &edit); 576 | 577 | // CRITICAL FIX: Mark that we need to reparse on next access 578 | // This forces updateTree() to be called on next getHighlightSpans() 579 | // current_buffer_content_.clear(); 580 | #endif 581 | } 582 | 583 | void SyntaxHighlighter::invalidateLineRange(int startLine, int endLine) 584 | { 585 | // OPTIMIZATION: Only invalidate affected lines, not entire cache 586 | 587 | // For single-line changes, only clear that line 588 | if (endLine - startLine <= 3) 589 | { 590 | for (int i = startLine; i <= endLine; ++i) 591 | { 592 | line_cache_.erase(i); 593 | line_states_.erase(i); 594 | } 595 | return; 596 | } 597 | 598 | // For multi-line changes, clear from startLine onwards 599 | auto 
cache_it = line_cache_.lower_bound(startLine); 600 | if (cache_it != line_cache_.end()) 601 | { 602 | line_cache_.erase(cache_it, line_cache_.end()); 603 | } 604 | 605 | auto state_it = line_states_.lower_bound(startLine); 606 | if (state_it != line_states_.end()) 607 | { 608 | line_states_.erase(state_it, line_states_.end()); 609 | } 610 | 611 | // DON'T clear buffer content unless structural change 612 | if (endLine - startLine > 10) 613 | { 614 | current_buffer_content_.clear(); // Force reparse on next access 615 | } 616 | } 617 | 618 | void SyntaxHighlighter::updateTree(const GapBuffer &buffer) 619 | { 620 | #ifdef TREE_SITTER_ENABLED 621 | std::string content; 622 | int lineCount = buffer.getLineCount(); 623 | 624 | // Build line offset cache while building content 625 | line_byte_offsets_.clear(); 626 | line_byte_offsets_.reserve(lineCount + 1); 627 | line_byte_offsets_.push_back(0); // First line starts at byte 0 628 | 629 | for (int i = 0; i < lineCount; i++) 630 | { 631 | if (i > 0) 632 | content += "\n"; 633 | content += buffer.getLine(i); 634 | 635 | // Store the byte offset for the next line 636 | line_byte_offsets_.push_back(content.length()); 637 | } 638 | 639 | if (content.empty()) 640 | { 641 | std::cerr << "WARNING: Attempting to parse empty buffer\n"; 642 | return; 643 | } 644 | 645 | std::lock_guard<std::mutex> lock(tree_mutex_); 646 | current_buffer_content_ = content; 647 | 648 | if (!tree_) 649 | { 650 | tree_ = ts_parser_parse_string(parser_, nullptr, content.c_str(), 651 | content.length()); 652 | } 653 | else 654 | { 655 | TSTree *old_tree = tree_; 656 | tree_ = ts_parser_parse_string(parser_, old_tree, content.c_str(), 657 | content.length()); 658 | if (old_tree && tree_) 659 | { 660 | ts_tree_delete(old_tree); 661 | } 662 | } 663 | 664 | if (!tree_) 665 | { 666 | std::cerr << "ERROR: Failed to parse tree\n"; 667 | } 668 | #endif 669 | } 670 | 671 | void SyntaxHighlighter::markViewportLines(int startLine, int endLine) const 672 | { 673 | 
priority_lines_.clear(); 674 | for (int i = startLine; i <= endLine; ++i) 675 | { 676 | priority_lines_.insert(i); 677 | } 678 | } 679 | 680 | bool SyntaxHighlighter::isLineHighlighted(int lineIndex) const 681 | { 682 | return line_cache_.find(lineIndex) != line_cache_.end(); 683 | } 684 | 685 | std::vector<ColorSpan> 686 | SyntaxHighlighter::executeTreeSitterQuery(const std::string &line, 687 | int lineNum) const 688 | { 689 | if (!current_ts_query_ || !tree_) 690 | return {}; 691 | 692 | std::lock_guard<std::mutex> lock(tree_mutex_); 693 | std::vector<ColorSpan> spans; 694 | TSQueryCursor *cursor = ts_query_cursor_new(); 695 | TSNode root_node = ts_tree_root_node(tree_); 696 | 697 | int adjusted_line = 698 | is_full_parse_ ? lineNum : (lineNum - viewport_start_line_); 699 | if (adjusted_line < 0 || 700 | adjusted_line >= ts_node_end_point(root_node).row + 1) 701 | { 702 | ts_query_cursor_delete(cursor); 703 | return {}; 704 | } 705 | 706 | // Calculate byte range for current line 707 | uint32_t line_start_byte = 0; 708 | uint32_t line_end_byte = 0; 709 | 710 | std::istringstream content_stream(current_buffer_content_); 711 | std::string content_line; 712 | int current_line = 0; 713 | 714 | while (std::getline(content_stream, content_line) && current_line <= lineNum) 715 | { 716 | if (current_line == lineNum) 717 | { 718 | line_end_byte = line_start_byte + content_line.length(); 719 | break; 720 | } 721 | line_start_byte += content_line.length() + 1; 722 | current_line++; 723 | } 724 | 725 | ts_query_cursor_set_byte_range(cursor, line_start_byte, line_end_byte); 726 | ts_query_cursor_exec(cursor, current_ts_query_, root_node); 727 | 728 | TSQueryMatch match; 729 | while (ts_query_cursor_next_match(cursor, &match)) 730 | { 731 | for (uint32_t i = 0; i < match.capture_count; i++) 732 | { 733 | TSQueryCapture capture = match.captures[i]; 734 | TSNode node = capture.node; 735 | 736 | TSPoint start_point = ts_node_start_point(node); 737 | TSPoint end_point = 
ts_node_end_point(node); 738 | 739 | // ORIGINAL: Only process captures starting on current line 740 | // Check if this capture affects the current line 741 | if (start_point.row <= (uint32_t)lineNum && 742 | end_point.row >= (uint32_t)lineNum) 743 | { 744 | uint32_t name_length; 745 | const char *capture_name_ptr = ts_query_capture_name_for_id( 746 | current_ts_query_, capture.index, &name_length); 747 | std::string capture_name(capture_name_ptr, name_length); 748 | 749 | int start_col = 750 | (start_point.row == (uint32_t)lineNum) ? start_point.column : 0; 751 | int end_col = (end_point.row == (uint32_t)lineNum) ? end_point.column 752 | : (int)line.length(); 753 | 754 | start_col = std::max(0, std::min(start_col, (int)line.length())); 755 | end_col = std::max(start_col, std::min(end_col, (int)line.length())); 756 | 757 | if (start_col < end_col) 758 | { 759 | int color_pair = getColorPairForCapture(capture_name); 760 | spans.push_back({start_col, end_col, color_pair, 0, 100}); 761 | } 762 | } 763 | } 764 | } 765 | 766 | ts_query_cursor_delete(cursor); 767 | return spans; 768 | } 769 | 770 | int SyntaxHighlighter::getColorPairForCapture( 771 | const std::string &capture_name) const 772 | { 773 | static const std::unordered_map<std::string, std::string> capture_to_color = { 774 | // Keywords 775 | {"keyword", "KEYWORD"}, 776 | {"keyword.control", "KEYWORD"}, 777 | {"keyword.function", "KEYWORD"}, 778 | {"keyword.operator", "KEYWORD"}, 779 | {"keyword.return", "KEYWORD"}, 780 | {"keyword.conditional", "KEYWORD"}, 781 | {"keyword.repeat", "KEYWORD"}, 782 | {"keyword.import", "KEYWORD"}, 783 | {"keyword.exception", "KEYWORD"}, 784 | 785 | // Types 786 | {"type", "TYPE"}, 787 | {"type.builtin", "TYPE"}, 788 | {"type.definition", "TYPE"}, 789 | {"class", "TYPE"}, 790 | {"interface", "TYPE"}, 791 | 792 | // Functions 793 | {"function", "FUNCTION"}, 794 | {"function.call", "FUNCTION"}, 795 | {"function.builtin", "FUNCTION"}, 796 | {"function.method", "FUNCTION"}, 797 | 
{"method", "FUNCTION"}, 798 | 799 | // Variables & constants 800 | 801 | {"variable", "VARIABLE"}, 802 | {"variable.parameter", "VARIABLE"}, 803 | {"variable.builtin", "CONSTANT"}, 804 | {"variable.member", "VARIABLE"}, 805 | {"constant", "CONSTANT"}, 806 | {"constant.builtin", "CONSTANT"}, 807 | {"parameter", "VARIABLE"}, 808 | 809 | // Literals 810 | {"string", "STRING_LITERAL"}, 811 | {"string_literal", "STRING_LITERAL"}, 812 | {"number", "NUMBER"}, 813 | {"integer", "NUMBER"}, 814 | {"float", "NUMBER"}, 815 | {"boolean", "CONSTANT"}, 816 | 817 | // Comments 818 | {"comment", "COMMENT"}, 819 | 820 | // Operators & punctuation 821 | {"operator", "OPERATOR"}, 822 | {"punctuation", "PUNCTUATION"}, 823 | {"punctuation.bracket", "PUNCTUATION"}, 824 | {"punctuation.delimiter", "PUNCTUATION"}, 825 | 826 | // Specialized 827 | {"namespace", "NAMESPACE"}, 828 | {"property", "PROPERTY"}, 829 | {"field", "PROPERTY"}, 830 | {"attribute", "DECORATOR"}, 831 | {"decorator", "DECORATOR"}, 832 | {"label", "LABEL"}, 833 | {"tag", "LABEL"}, 834 | 835 | // Preprocessor/macro 836 | {"preproc", "MACRO"}, 837 | {"preproc_include", "MACRO"}, 838 | {"preproc_def", "MACRO"}, 839 | {"preproc_call", "MACRO"}, 840 | {"preproc_if", "MACRO"}, 841 | {"preproc_ifdef", "MACRO"}, 842 | {"preproc_ifndef", "MACRO"}, 843 | {"preproc_else", "MACRO"}, 844 | {"preproc_elif", "MACRO"}, 845 | {"preproc_endif", "MACRO"}, 846 | {"macro", "MACRO"}, 847 | 848 | // Markup (Markdown, etc.) 
849 | {"markup.heading", "MARKUP_HEADING"}, 850 | {"heading", "MARKUP_HEADING"}, 851 | {"markup.bold", "MARKUP_BOLD"}, 852 | {"markup.italic", "MARKUP_ITALIC"}, 853 | {"emphasis", "MARKUP_ITALIC"}, 854 | {"markup.code", "MARKUP_CODE"}, 855 | {"code", "MARKUP_CODE"}, 856 | {"markup.link", "MARKUP_LINK"}, 857 | {"link_text", "MARKUP_LINK"}, 858 | {"markup.url", "MARKUP_URL"}, 859 | {"link_uri", "MARKUP_URL"}, 860 | {"markup.quote", "MARKUP_BLOCKQUOTE"}, 861 | {"markup.list", "MARKUP_LIST"}, 862 | {"markup.code", "MARKUP_CODE"}, 863 | {"code_fence_content", "MARKUP_CODE_BLOCK"}, 864 | {"code_span", "MARKUP_CODE"}, 865 | 866 | // Markdown structure 867 | {"markup.list", "MARKUP_LIST"}, 868 | {"markup.quote", "MARKUP_BLOCKQUOTE"}, 869 | }; 870 | 871 | auto it = capture_to_color.find(capture_name); 872 | if (it != capture_to_color.end()) 873 | { 874 | return getColorPairValue(it->second); 875 | } 876 | 877 | // Fallback: hierarchical matching 878 | if (capture_name.find("keyword") != std::string::npos) 879 | return getColorPairValue("KEYWORD"); 880 | if (capture_name.find("type") != std::string::npos) 881 | return getColorPairValue("TYPE"); 882 | if (capture_name.find("function") != std::string::npos) 883 | return getColorPairValue("FUNCTION"); 884 | if (capture_name.find("string") != std::string::npos) 885 | return getColorPairValue("STRING_LITERAL"); 886 | if (capture_name.find("comment") != std::string::npos) 887 | return getColorPairValue("COMMENT"); 888 | if (capture_name.find("number") != std::string::npos) 889 | return getColorPairValue("NUMBER"); 890 | if (capture_name.find("constant") != std::string::npos) 891 | return getColorPairValue("CONSTANT"); 892 | 893 | return 0; // Default 894 | } 895 | #endif 896 | 897 | int SyntaxHighlighter::getColorPairValue(const std::string &color_name) const 898 | { 899 | static const std::unordered_map<std::string, int> color_map = { 900 | {"COMMENT", COMMENT}, 901 | {"KEYWORD", KEYWORD}, 902 | {"STRING_LITERAL", 
STRING_LITERAL}, 903 | {"NUMBER", NUMBER}, 904 | {"FUNCTION", FUNCTION}, 905 | {"VARIABLE", VARIABLE}, 906 | {"TYPE", TYPE}, 907 | {"OPERATOR", OPERATOR}, 908 | {"PUNCTUATION", PUNCTUATION}, 909 | {"CONSTANT", CONSTANT}, 910 | {"NAMESPACE", NAMESPACE}, 911 | {"PROPERTY", PROPERTY}, 912 | {"DECORATOR", DECORATOR}, 913 | {"MACRO", MACRO}, 914 | {"LABEL", LABEL}, 915 | {"MARKUP_HEADING", MARKUP_HEADING}, 916 | {"MARKUP_BOLD", MARKUP_BOLD}, 917 | {"MARKUP_ITALIC", MARKUP_ITALIC}, 918 | {"MARKUP_CODE", MARKUP_CODE}, 919 | {"MARKUP_CODE_BLOCK", MARKUP_CODE_BLOCK}, 920 | {"MARKUP_LINK", MARKUP_LINK}, 921 | {"MARKUP_URL", MARKUP_URL}, 922 | {"MARKUP_LIST", MARKUP_LIST}, 923 | {"MARKUP_BLOCKQUOTE", MARKUP_BLOCKQUOTE}, 924 | {"MARKUP_STRIKETHROUGH", MARKUP_STRIKETHROUGH}, 925 | {"MARKUP_QUOTE", MARKUP_QUOTE}}; 926 | 927 | auto it = color_map.find(color_name); 928 | return (it != color_map.end()) ? it->second : 0; 929 | } 930 | 931 | int SyntaxHighlighter::getAttributeValue( 932 | const std::string &attribute_name) const 933 | { 934 | static const std::unordered_map<std::string, int> attribute_map = { 935 | {"0", 0}, 936 | {"A_BOLD", A_BOLD}, 937 | {"A_DIM", A_DIM}, 938 | {"A_UNDERLINE", A_UNDERLINE}, 939 | {"A_REVERSE", A_REVERSE}}; 940 | 941 | auto it = attribute_map.find(attribute_name); 942 | return (it != attribute_map.end()) ? 
it->second : 0; 943 | } 944 | 945 | std::vector<ColorSpan> 946 | SyntaxHighlighter::getBasicHighlightSpans(const std::string &line) const 947 | { 948 | std::vector<ColorSpan> spans; 949 | 950 | // Very basic regex-based highlighting as fallback 951 | // Comments (# and //) 952 | size_t comment_pos = line.find('#'); 953 | if (comment_pos == std::string::npos) 954 | { 955 | comment_pos = line.find("//"); 956 | } 957 | if (comment_pos != std::string::npos) 958 | { 959 | spans.push_back({static_cast<int>(comment_pos), 960 | static_cast<int>(line.length()), 961 | getColorPairValue("COMMENT"), 0, 100}); 962 | } 963 | 964 | // Simple string detection (basic) 965 | bool in_string = false; 966 | char string_char = 0; 967 | size_t string_start = 0; 968 | 969 | for (size_t i = 0; i < line.length(); i++) 970 | { 971 | char c = line[i]; 972 | if (!in_string && (c == '"' || c == '\'')) 973 | { 974 | in_string = true; 975 | string_char = c; 976 | string_start = i; 977 | } 978 | else if (in_string && c == string_char && (i == 0 || line[i - 1] != '\\')) 979 | { 980 | spans.push_back({static_cast<int>(string_start), static_cast<int>(i + 1), 981 | getColorPairValue("STRING_LITERAL"), 0, 90}); 982 | in_string = false; 983 | } 984 | } 985 | 986 | return spans; 987 | } 988 | 989 | void SyntaxHighlighter::loadBasicRules() 990 | { 991 | // This is called as a fallback when Tree-sitter is not available 992 | std::cerr << "Loading basic highlighting rules (fallback mode)" << std::endl; 993 | } 994 | 995 | // Markdown state management (unchanged from original) 996 | void SyntaxHighlighter::updateMarkdownState(const GapBuffer &buffer) 997 | { 998 | if (currentLanguage != "Markdown") 999 | { 1000 | line_states_.clear(); 1001 | return; 1002 | } 1003 | 1004 | line_states_.clear(); 1005 | MarkdownState currentState = MarkdownState::DEFAULT; 1006 | 1007 | int lineCount = buffer.getLineCount(); 1008 | for (int i = 0; i < lineCount; ++i) 1009 | { 1010 | std::string line = buffer.getLine(i); 1011 | 
line_states_[i] = currentState; 1012 | 1013 | if (currentState == MarkdownState::DEFAULT) 1014 | { 1015 | if (line.rfind("```", 0) == 0) 1016 | { 1017 | currentState = MarkdownState::IN_FENCED_CODE_BLOCK; 1018 | } 1019 | else if (line.rfind(">", 0) == 0) 1020 | { 1021 | line_states_[i] = MarkdownState::IN_BLOCKQUOTE; 1022 | } 1023 | } 1024 | else if (currentState == MarkdownState::IN_FENCED_CODE_BLOCK) 1025 | { 1026 | if (line.rfind("```", 0) == 0) 1027 | { 1028 | currentState = MarkdownState::DEFAULT; 1029 | } 1030 | line_states_[i] = MarkdownState::IN_FENCED_CODE_BLOCK; 1031 | } 1032 | } 1033 | } 1034 | 1035 | std::vector<std::string> SyntaxHighlighter::getSupportedExtensions() const 1036 | { 1037 | return {"cpp", "h", "hpp", "c", "py", "md", "txt"}; 1038 | } 1039 | 1040 | void SyntaxHighlighter::debugTreeSitterState() const 1041 | { 1042 | #ifdef TREE_SITTER_ENABLED 1043 | std::cerr << "=== Tree-sitter State Debug ===\n"; 1044 | std::cerr << "Current language: " << currentLanguage << "\n"; 1045 | std::cerr << "Parser: " << (parser_ ? "EXISTS" : "NULL") << "\n"; 1046 | std::cerr << "Tree: " << (tree_ ? "EXISTS" : "NULL") << "\n"; 1047 | std::cerr << "TS Language: " << (current_ts_language_ ? "EXISTS" : "NULL") 1048 | << "\n"; 1049 | std::cerr << "TS Query: " << (current_ts_query_ ? 
"EXISTS" : "NULL") << "\n"; 1050 | std::cerr << "Buffer content length: " << current_buffer_content_.length() 1051 | << "\n"; 1052 | std::cerr << "Line cache size: " << line_cache_.size() << "\n"; 1053 | 1054 | if (tree_) 1055 | { 1056 | TSNode root = ts_tree_root_node(tree_); 1057 | char *tree_str = ts_node_string(root); 1058 | std::cerr << "Parse tree (truncated): " 1059 | << std::string(tree_str).substr(0, 200) << "...\n"; 1060 | free(tree_str); 1061 | } 1062 | std::cerr << "=== End Debug ===\n"; 1063 | #else 1064 | std::cerr << "Tree-sitter not enabled\n"; 1065 | #endif 1066 | } 1067 | 1068 | void SyntaxHighlighter::parseViewportOnly(const GapBuffer &buffer, 1069 | int targetLine) 1070 | { 1071 | #ifdef TREE_SITTER_ENABLED 1072 | if (!parser_ || !current_ts_language_) 1073 | return; 1074 | 1075 | int startLine = std::max(0, targetLine - 50); 1076 | int endLine = std::min(buffer.getLineCount() - 1, targetLine + 50); 1077 | 1078 | std::string content; 1079 | for (int i = startLine; i <= endLine; i++) 1080 | { 1081 | if (i > startLine) 1082 | content += "\n"; 1083 | content += buffer.getLine(i); 1084 | } 1085 | 1086 | if (content.empty()) 1087 | return; 1088 | 1089 | TSTree *new_tree = ts_parser_parse_string(parser_, nullptr, content.c_str(), 1090 | content.length()); 1091 | 1092 | if (new_tree) 1093 | { 1094 | std::lock_guard<std::mutex> lock(tree_mutex_); // LOCK ADDED 1095 | if (tree_) 1096 | ts_tree_delete(tree_); 1097 | tree_ = new_tree; 1098 | current_buffer_content_ = content; 1099 | viewport_start_line_ = startLine; 1100 | is_full_parse_ = false; 1101 | } 1102 | #endif 1103 | } 1104 | 1105 | void SyntaxHighlighter::scheduleBackgroundParse(const GapBuffer &buffer) 1106 | { 1107 | #ifdef TREE_SITTER_ENABLED 1108 | if (is_parsing_ || !parser_ || !current_ts_language_) 1109 | return; 1110 | 1111 | auto now = std::chrono::steady_clock::now(); 1112 | auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>( 1113 | now - last_parse_time_) 1114 | 
.count(); 1115 | 1116 | if (elapsed < 500) 1117 | return; 1118 | 1119 | // Copy content BEFORE starting thread 1120 | std::string content; 1121 | int lineCount = buffer.getLineCount(); 1122 | content.reserve(lineCount * 80); 1123 | 1124 | for (int i = 0; i < lineCount; i++) 1125 | { 1126 | if (i > 0) 1127 | content += "\n"; 1128 | content += buffer.getLine(i); 1129 | } 1130 | 1131 | if (content.empty()) 1132 | return; 1133 | 1134 | is_parsing_ = true; 1135 | last_parse_time_ = now; 1136 | 1137 | // NEW: Capture current version 1138 | uint64_t expected_version = tree_version_.load(); 1139 | 1140 | // Create a COPY of parser state to avoid races 1141 | TSParser *temp_parser = ts_parser_new(); 1142 | if (!ts_parser_set_language(temp_parser, current_ts_language_)) 1143 | { 1144 | ts_parser_delete(temp_parser); 1145 | is_parsing_ = false; 1146 | return; 1147 | } 1148 | 1149 | parse_thread_ = std::thread( 1150 | [this, content, temp_parser, expected_version]() mutable 1151 | { 1152 | TSTree *new_tree = ts_parser_parse_string( 1153 | temp_parser, nullptr, content.c_str(), content.length()); 1154 | 1155 | if (new_tree) 1156 | { 1157 | std::lock_guard<std::mutex> lock(tree_mutex_); 1158 | 1159 | // NEW: Only update if no newer edits happened 1160 | if (tree_version_.load() == expected_version) 1161 | { 1162 | TSTree *old_tree = tree_; 1163 | tree_ = new_tree; 1164 | current_buffer_content_ = std::move(content); 1165 | is_full_parse_ = true; 1166 | 1167 | if (old_tree) 1168 | ts_tree_delete(old_tree); 1169 | } 1170 | else 1171 | { 1172 | // Discard stale parse - user has made newer edits 1173 | ts_tree_delete(new_tree); 1174 | } 1175 | } 1176 | 1177 | ts_parser_delete(temp_parser); 1178 | is_parsing_ = false; 1179 | parse_complete_ = true; 1180 | }); 1181 | 1182 | parse_thread_.detach(); 1183 | #endif 1184 | } 1185 | 1186 | void SyntaxHighlighter::forceFullReparse(const GapBuffer &buffer) 1187 | { 1188 | #ifdef TREE_SITTER_ENABLED 1189 | if (!parser_ || 
!current_ts_language_) 1190 | return; 1191 | 1192 | std::lock_guard<std::mutex> lock(tree_mutex_); 1193 | 1194 | // Build fresh content 1195 | std::string content; 1196 | int lineCount = buffer.getLineCount(); 1197 | 1198 | // Pre-allocate to avoid reallocations 1199 | size_t estimated_size = lineCount * 50; // Rough estimate 1200 | content.reserve(estimated_size); 1201 | 1202 | for (int i = 0; i < lineCount; i++) 1203 | { 1204 | if (i > 0) 1205 | content += "\n"; 1206 | content += buffer.getLine(i); 1207 | } 1208 | 1209 | if (content.empty()) 1210 | { 1211 | std::cerr << "WARNING: Empty buffer in forceFullReparse\n"; 1212 | return; 1213 | } 1214 | 1215 | // OPTIMIZATION: Use the old tree as a reference for faster re-parsing 1216 | TSTree *old_tree = tree_; 1217 | tree_ = ts_parser_parse_string(parser_, old_tree, content.c_str(), 1218 | content.length()); 1219 | 1220 | if (tree_) 1221 | { 1222 | current_buffer_content_ = std::move(content); // Move instead of copy 1223 | is_full_parse_ = true; 1224 | 1225 | // Delete old tree AFTER successful parse 1226 | if (old_tree) 1227 | ts_tree_delete(old_tree); 1228 | } 1229 | else 1230 | { 1231 | std::cerr << "ERROR: Reparse failed, keeping old tree\n"; 1232 | tree_ = old_tree; // Restore old tree 1233 | return; 1234 | } 1235 | #endif 1236 | 1237 | // Clear cache ONLY, don't rebuild markdown state unless necessary 1238 | line_cache_.clear(); 1239 | 1240 | if (currentLanguage == "Markdown") 1241 | { 1242 | updateMarkdownState(buffer); 1243 | } 1244 | } 1245 | 1246 | void SyntaxHighlighter::clearAllCache() 1247 | { 1248 | // Clear ALL cached line highlighting 1249 | line_cache_.clear(); 1250 | 1251 | // Clear line states (for Markdown) 1252 | line_states_.clear(); 1253 | 1254 | // Clear priority lines 1255 | priority_lines_.clear(); 1256 | 1257 | // CRITICAL: Force tree-sitter content to be marked as stale 1258 | current_buffer_content_.clear(); 1259 | 1260 | // Mark that we need a full reparse 1261 | is_full_parse_ = false; 
1262 | } ``` -------------------------------------------------------------------------------- /deps/tree-sitter-markdown/tree-sitter-markdown/src/scanner.c: -------------------------------------------------------------------------------- ```cpp 1 | #include "tree_sitter/parser.h" 2 | #include <assert.h> 3 | #include <ctype.h> 4 | #include <string.h> 5 | #include <wchar.h> 6 | #include <wctype.h> 7 | 8 | // For explanation of the tokens see grammar.js 9 | typedef enum { 10 | LINE_ENDING, 11 | SOFT_LINE_ENDING, 12 | BLOCK_CLOSE, 13 | BLOCK_CONTINUATION, 14 | BLOCK_QUOTE_START, 15 | INDENTED_CHUNK_START, 16 | ATX_H1_MARKER, 17 | ATX_H2_MARKER, 18 | ATX_H3_MARKER, 19 | ATX_H4_MARKER, 20 | ATX_H5_MARKER, 21 | ATX_H6_MARKER, 22 | SETEXT_H1_UNDERLINE, 23 | SETEXT_H2_UNDERLINE, 24 | THEMATIC_BREAK, 25 | LIST_MARKER_MINUS, 26 | LIST_MARKER_PLUS, 27 | LIST_MARKER_STAR, 28 | LIST_MARKER_PARENTHESIS, 29 | LIST_MARKER_DOT, 30 | LIST_MARKER_MINUS_DONT_INTERRUPT, 31 | LIST_MARKER_PLUS_DONT_INTERRUPT, 32 | LIST_MARKER_STAR_DONT_INTERRUPT, 33 | LIST_MARKER_PARENTHESIS_DONT_INTERRUPT, 34 | LIST_MARKER_DOT_DONT_INTERRUPT, 35 | FENCED_CODE_BLOCK_START_BACKTICK, 36 | FENCED_CODE_BLOCK_START_TILDE, 37 | BLANK_LINE_START, 38 | FENCED_CODE_BLOCK_END_BACKTICK, 39 | FENCED_CODE_BLOCK_END_TILDE, 40 | HTML_BLOCK_1_START, 41 | HTML_BLOCK_1_END, 42 | HTML_BLOCK_2_START, 43 | HTML_BLOCK_3_START, 44 | HTML_BLOCK_4_START, 45 | HTML_BLOCK_5_START, 46 | HTML_BLOCK_6_START, 47 | HTML_BLOCK_7_START, 48 | CLOSE_BLOCK, 49 | NO_INDENTED_CHUNK, 50 | ERROR, 51 | TRIGGER_ERROR, 52 | TOKEN_EOF, 53 | MINUS_METADATA, 54 | PLUS_METADATA, 55 | PIPE_TABLE_START, 56 | PIPE_TABLE_LINE_ENDING, 57 | } TokenType; 58 | 59 | // Description of a block on the block stack. 60 | // 61 | // LIST_ITEM is a list item with minimal indentation (content begins at indent 62 | // level 2) while LIST_ITEM_MAX_INDENTATION represents a list item with maximal 63 | // indentation without being considered a indented code block. 
// Determines if a character is punctuation as defined by the markdown spec:
// any character in the range '!'..'~' that is not an ASCII letter or digit.
static bool is_punctuation(char chr) {
    if (chr < '!' || chr > '~') {
        return false;
    }
    if (chr >= '0' && chr <= '9') {
        return false;
    }
    if (chr >= 'A' && chr <= 'Z') {
        return false;
    }
    if (chr >= 'a' && chr <= 'z') {
        return false;
    }
    return true;
}
99 | static uint8_t list_item_indentation(Block block) { 100 | return (uint8_t)(block - LIST_ITEM + 2); 101 | } 102 | 103 | #define NUM_HTML_TAG_NAMES_RULE_1 3 104 | 105 | static const char *const HTML_TAG_NAMES_RULE_1[NUM_HTML_TAG_NAMES_RULE_1] = { 106 | "pre", "script", "style"}; 107 | 108 | #define NUM_HTML_TAG_NAMES_RULE_7 62 109 | 110 | static const char *const HTML_TAG_NAMES_RULE_7[NUM_HTML_TAG_NAMES_RULE_7] = { 111 | "address", "article", "aside", "base", "basefont", "blockquote", 112 | "body", "caption", "center", "col", "colgroup", "dd", 113 | "details", "dialog", "dir", "div", "dl", "dt", 114 | "fieldset", "figcaption", "figure", "footer", "form", "frame", 115 | "frameset", "h1", "h2", "h3", "h4", "h5", 116 | "h6", "head", "header", "hr", "html", "iframe", 117 | "legend", "li", "link", "main", "menu", "menuitem", 118 | "nav", "noframes", "ol", "optgroup", "option", "p", 119 | "param", "section", "source", "summary", "table", "tbody", 120 | "td", "tfoot", "th", "thead", "title", "tr", 121 | "track", "ul"}; 122 | 123 | // For explanation of the tokens see grammar.js 124 | static const bool paragraph_interrupt_symbols[] = { 125 | false, // LINE_ENDING, 126 | false, // SOFT_LINE_ENDING, 127 | false, // BLOCK_CLOSE, 128 | false, // BLOCK_CONTINUATION, 129 | true, // BLOCK_QUOTE_START, 130 | false, // INDENTED_CHUNK_START, 131 | true, // ATX_H1_MARKER, 132 | true, // ATX_H2_MARKER, 133 | true, // ATX_H3_MARKER, 134 | true, // ATX_H4_MARKER, 135 | true, // ATX_H5_MARKER, 136 | true, // ATX_H6_MARKER, 137 | true, // SETEXT_H1_UNDERLINE, 138 | true, // SETEXT_H2_UNDERLINE, 139 | true, // THEMATIC_BREAK, 140 | true, // LIST_MARKER_MINUS, 141 | true, // LIST_MARKER_PLUS, 142 | true, // LIST_MARKER_STAR, 143 | true, // LIST_MARKER_PARENTHESIS, 144 | true, // LIST_MARKER_DOT, 145 | false, // LIST_MARKER_MINUS_DONT_INTERRUPT, 146 | false, // LIST_MARKER_PLUS_DONT_INTERRUPT, 147 | false, // LIST_MARKER_STAR_DONT_INTERRUPT, 148 | false, // 
// Rounds x up to a power of two by copying the highest set bit of (x - 1)
// into every lower bit position and then adding one. Only shifts of 1..16 are
// applied, so the result is a power of two for inputs that fit in 32 bits.
// An input of 0 wraps around and yields 0.
static size_t roundup_32(size_t x) {
    size_t smeared = x - 1;

    for (unsigned shift = 1; shift <= 16; shift <<= 1) {
        smeared |= smeared >> shift;
    }

    return smeared + 1;
}
212 | uint8_t indentation; 213 | // The current column. Used to decide how many spaces a tab should equal 214 | uint8_t column; 215 | // The delimiter length of the currently open fenced code block 216 | uint8_t fenced_code_block_delimiter_length; 217 | 218 | bool simulate; 219 | } Scanner; 220 | 221 | static void push_block(Scanner *s, Block b) { 222 | if (s->open_blocks.size == s->open_blocks.capacity) { 223 | s->open_blocks.capacity = 224 | s->open_blocks.capacity ? s->open_blocks.capacity << 1 : 8; 225 | void *tmp = realloc(s->open_blocks.items, 226 | sizeof(Block) * s->open_blocks.capacity); 227 | assert(tmp != NULL); 228 | s->open_blocks.items = tmp; 229 | } 230 | 231 | s->open_blocks.items[s->open_blocks.size++] = b; 232 | } 233 | 234 | static inline Block pop_block(Scanner *s) { 235 | return s->open_blocks.items[--s->open_blocks.size]; 236 | } 237 | 238 | // Write the whole state of a Scanner to a byte buffer 239 | static unsigned serialize(Scanner *s, char *buffer) { 240 | unsigned size = 0; 241 | buffer[size++] = (char)s->state; 242 | buffer[size++] = (char)s->matched; 243 | buffer[size++] = (char)s->indentation; 244 | buffer[size++] = (char)s->column; 245 | buffer[size++] = (char)s->fenced_code_block_delimiter_length; 246 | size_t blocks_count = s->open_blocks.size; 247 | if (blocks_count > 0) { 248 | memcpy(&buffer[size], s->open_blocks.items, 249 | blocks_count * sizeof(Block)); 250 | size += blocks_count * sizeof(Block); 251 | } 252 | return size; 253 | } 254 | 255 | // Read the whole state of a Scanner from a byte buffer 256 | // `serizalize` and `deserialize` should be fully symmetric. 
257 | static void deserialize(Scanner *s, const char *buffer, unsigned length) { 258 | s->open_blocks.size = 0; 259 | s->open_blocks.capacity = 0; 260 | s->state = 0; 261 | s->matched = 0; 262 | s->indentation = 0; 263 | s->column = 0; 264 | s->fenced_code_block_delimiter_length = 0; 265 | if (length > 0) { 266 | size_t size = 0; 267 | s->state = (uint8_t)buffer[size++]; 268 | s->matched = (uint8_t)buffer[size++]; 269 | s->indentation = (uint8_t)buffer[size++]; 270 | s->column = (uint8_t)buffer[size++]; 271 | s->fenced_code_block_delimiter_length = (uint8_t)buffer[size++]; 272 | size_t blocks_size = length - size; 273 | if (blocks_size > 0) { 274 | size_t blocks_count = blocks_size / sizeof(Block); 275 | 276 | // ensure open blocks has enough room 277 | if (s->open_blocks.capacity < blocks_count) { 278 | size_t capacity = roundup_32(blocks_count); 279 | void *tmp = realloc(s->open_blocks.items, 280 | sizeof(Block) * capacity); 281 | assert(tmp != NULL); 282 | s->open_blocks.items = tmp; 283 | s->open_blocks.capacity = capacity; 284 | } 285 | memcpy(s->open_blocks.items, &buffer[size], blocks_size); 286 | s->open_blocks.size = blocks_count; 287 | } 288 | } 289 | } 290 | 291 | static void mark_end(Scanner *s, TSLexer *lexer) { 292 | if (!s->simulate) { 293 | lexer->mark_end(lexer); 294 | } 295 | } 296 | 297 | // Convenience function to emit the error token. This is done to stop invalid 298 | // parse branches. Specifically: 299 | // 1. When encountering a newline after a line break that ended a paragraph, and 300 | // no new block 301 | // has been opened. 302 | // 2. When encountering a new block after a soft line break. 303 | // 3. When a `$._trigger_error` token is valid, which is used to stop parse 304 | // branches through 305 | // normal tree-sitter grammar rules. 
306 | // 307 | // See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in 308 | // grammar.js 309 | static bool error(TSLexer *lexer) { 310 | lexer->result_symbol = ERROR; 311 | return true; 312 | } 313 | 314 | // Advance the lexer one character 315 | // Also keeps track of the current column, counting tabs as spaces with tab stop 316 | // 4 See https://github.github.com/gfm/#tabs 317 | static size_t advance(Scanner *s, TSLexer *lexer) { 318 | size_t size = 1; 319 | if (lexer->lookahead == '\t') { 320 | size = 4 - s->column; 321 | s->column = 0; 322 | } else { 323 | s->column = (s->column + 1) % 4; 324 | } 325 | lexer->advance(lexer, false); 326 | return size; 327 | } 328 | 329 | // Try to match the given block, i.e. consume all tokens that belong to the 330 | // block. These are 331 | // 1. indentation for list items and indented code blocks 332 | // 2. '>' for block quotes 333 | // Returns true if the block is matched and false otherwise 334 | static bool match(Scanner *s, TSLexer *lexer, Block block) { 335 | switch (block) { 336 | case INDENTED_CODE_BLOCK: 337 | while (s->indentation < 4) { 338 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 339 | s->indentation += advance(s, lexer); 340 | } else { 341 | break; 342 | } 343 | } 344 | if (s->indentation >= 4 && lexer->lookahead != '\n' && 345 | lexer->lookahead != '\r') { 346 | s->indentation -= 4; 347 | return true; 348 | } 349 | break; 350 | case LIST_ITEM: 351 | case LIST_ITEM_1_INDENTATION: 352 | case LIST_ITEM_2_INDENTATION: 353 | case LIST_ITEM_3_INDENTATION: 354 | case LIST_ITEM_4_INDENTATION: 355 | case LIST_ITEM_5_INDENTATION: 356 | case LIST_ITEM_6_INDENTATION: 357 | case LIST_ITEM_7_INDENTATION: 358 | case LIST_ITEM_8_INDENTATION: 359 | case LIST_ITEM_9_INDENTATION: 360 | case LIST_ITEM_10_INDENTATION: 361 | case LIST_ITEM_11_INDENTATION: 362 | case LIST_ITEM_12_INDENTATION: 363 | case LIST_ITEM_13_INDENTATION: 364 | case LIST_ITEM_14_INDENTATION: 365 | case 
LIST_ITEM_MAX_INDENTATION: 366 | while (s->indentation < list_item_indentation(block)) { 367 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 368 | s->indentation += advance(s, lexer); 369 | } else { 370 | break; 371 | } 372 | } 373 | if (s->indentation >= list_item_indentation(block)) { 374 | s->indentation -= list_item_indentation(block); 375 | return true; 376 | } 377 | if (lexer->lookahead == '\n' || lexer->lookahead == '\r') { 378 | s->indentation = 0; 379 | return true; 380 | } 381 | break; 382 | case BLOCK_QUOTE: 383 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 384 | s->indentation += advance(s, lexer); 385 | } 386 | if (lexer->lookahead == '>') { 387 | advance(s, lexer); 388 | s->indentation = 0; 389 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 390 | s->indentation += advance(s, lexer) - 1; 391 | } 392 | return true; 393 | } 394 | break; 395 | case FENCED_CODE_BLOCK: 396 | case ANONYMOUS: 397 | return true; 398 | } 399 | return false; 400 | } 401 | 402 | static bool parse_fenced_code_block(Scanner *s, const char delimiter, 403 | TSLexer *lexer, const bool *valid_symbols) { 404 | // count the number of backticks 405 | uint8_t level = 0; 406 | while (lexer->lookahead == delimiter) { 407 | advance(s, lexer); 408 | level++; 409 | } 410 | mark_end(s, lexer); 411 | // If this is able to close a fenced code block then that is the only valid 412 | // interpretation. It can only close a fenced code block if the number of 413 | // backticks is at least the number of backticks of the opening delimiter. 414 | // Also it cannot be indented more than 3 spaces. 415 | if ((delimiter == '`' ? 
valid_symbols[FENCED_CODE_BLOCK_END_BACKTICK] 416 | : valid_symbols[FENCED_CODE_BLOCK_END_TILDE]) && 417 | s->indentation < 4 && level >= s->fenced_code_block_delimiter_length) { 418 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 419 | advance(s, lexer); 420 | } 421 | if (lexer->lookahead == '\n' || lexer->lookahead == '\r') { 422 | s->fenced_code_block_delimiter_length = 0; 423 | lexer->result_symbol = delimiter == '`' 424 | ? FENCED_CODE_BLOCK_END_BACKTICK 425 | : FENCED_CODE_BLOCK_END_TILDE; 426 | return true; 427 | } 428 | } 429 | // If this could be the start of a fenced code block, check if the info 430 | // string contains any backticks. 431 | if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_START_BACKTICK] 432 | : valid_symbols[FENCED_CODE_BLOCK_START_TILDE]) && 433 | level >= 3) { 434 | bool info_string_has_backtick = false; 435 | if (delimiter == '`') { 436 | while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && 437 | !lexer->eof(lexer)) { 438 | if (lexer->lookahead == '`') { 439 | info_string_has_backtick = true; 440 | break; 441 | } 442 | advance(s, lexer); 443 | } 444 | } 445 | // If it does not then choose to interpret this as the start of a fenced 446 | // code block. 447 | if (!info_string_has_backtick) { 448 | lexer->result_symbol = delimiter == '`' 449 | ? FENCED_CODE_BLOCK_START_BACKTICK 450 | : FENCED_CODE_BLOCK_START_TILDE; 451 | if (!s->simulate) 452 | push_block(s, FENCED_CODE_BLOCK); 453 | // Remember the length of the delimiter for later, since we need it 454 | // to decide whether a sequence of backticks can close the block. 455 | s->fenced_code_block_delimiter_length = level; 456 | s->indentation = 0; 457 | return true; 458 | } 459 | } 460 | return false; 461 | } 462 | 463 | static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { 464 | advance(s, lexer); 465 | mark_end(s, lexer); 466 | // Otherwise count the number of stars permitting whitespaces between them. 
467 | size_t star_count = 1; 468 | // Also remember how many stars there are before the first whitespace... 469 | // ...and how many spaces follow the first star. 470 | uint8_t extra_indentation = 0; 471 | for (;;) { 472 | if (lexer->lookahead == '*') { 473 | if (star_count == 1 && extra_indentation >= 1 && 474 | valid_symbols[LIST_MARKER_STAR]) { 475 | // If we get to this point then the token has to be at least 476 | // this long. We need to call `mark_end` here in case we decide 477 | // later that this is a list item. 478 | mark_end(s, lexer); 479 | } 480 | star_count++; 481 | advance(s, lexer); 482 | } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 483 | if (star_count == 1) { 484 | extra_indentation += advance(s, lexer); 485 | } else { 486 | advance(s, lexer); 487 | } 488 | } else { 489 | break; 490 | } 491 | } 492 | bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; 493 | bool dont_interrupt = false; 494 | if (star_count == 1 && line_end) { 495 | extra_indentation = 1; 496 | // line is empty so don't interrupt paragraphs if this is a list marker 497 | dont_interrupt = s->matched == s->open_blocks.size; 498 | } 499 | // If there were at least 3 stars then this could be a thematic break 500 | bool thematic_break = star_count >= 3 && line_end; 501 | // If there was a star and at least one space after that star then this 502 | // could be a list marker. 503 | bool list_marker_star = star_count >= 1 && extra_indentation >= 1; 504 | if (valid_symbols[THEMATIC_BREAK] && thematic_break && s->indentation < 4) { 505 | // If a thematic break is valid then it takes precedence 506 | lexer->result_symbol = THEMATIC_BREAK; 507 | mark_end(s, lexer); 508 | s->indentation = 0; 509 | return true; 510 | } 511 | if ((dont_interrupt ? 
valid_symbols[LIST_MARKER_STAR_DONT_INTERRUPT] 512 | : valid_symbols[LIST_MARKER_STAR]) && 513 | list_marker_star) { 514 | // List markers take precedence over emphasis markers 515 | // If star_count > 1 then we already called mark_end at the right point. 516 | // Otherwise the token should go until this point. 517 | if (star_count == 1) { 518 | mark_end(s, lexer); 519 | } 520 | // Not counting one space... 521 | extra_indentation--; 522 | // ... check if the list item begins with an indented code block 523 | if (extra_indentation <= 3) { 524 | // If not then calculate the indentation level of the list item 525 | // content as indentation of list marker + indentation after list 526 | // marker - 1 527 | extra_indentation += s->indentation; 528 | s->indentation = 0; 529 | } else { 530 | // Otherwise the indentation level is just the indentation of the 531 | // list marker. We keep the indentation after the list marker for 532 | // later blocks. 533 | uint8_t temp = s->indentation; 534 | s->indentation = extra_indentation; 535 | extra_indentation = temp; 536 | } 537 | if (!s->simulate) 538 | push_block(s, (Block)(LIST_ITEM + extra_indentation)); 539 | lexer->result_symbol = 540 | dont_interrupt ? 
LIST_MARKER_STAR_DONT_INTERRUPT : LIST_MARKER_STAR; 541 | return true; 542 | } 543 | return false; 544 | } 545 | 546 | static bool parse_thematic_break_underscore(Scanner *s, TSLexer *lexer, 547 | const bool *valid_symbols) { 548 | advance(s, lexer); 549 | mark_end(s, lexer); 550 | size_t underscore_count = 1; 551 | for (;;) { 552 | if (lexer->lookahead == '_') { 553 | underscore_count++; 554 | advance(s, lexer); 555 | } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 556 | advance(s, lexer); 557 | } else { 558 | break; 559 | } 560 | } 561 | bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; 562 | if (underscore_count >= 3 && line_end && valid_symbols[THEMATIC_BREAK]) { 563 | lexer->result_symbol = THEMATIC_BREAK; 564 | mark_end(s, lexer); 565 | s->indentation = 0; 566 | return true; 567 | } 568 | return false; 569 | } 570 | 571 | static bool parse_block_quote(Scanner *s, TSLexer *lexer, 572 | const bool *valid_symbols) { 573 | if (valid_symbols[BLOCK_QUOTE_START]) { 574 | advance(s, lexer); 575 | s->indentation = 0; 576 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 577 | s->indentation += advance(s, lexer) - 1; 578 | } 579 | lexer->result_symbol = BLOCK_QUOTE_START; 580 | if (!s->simulate) 581 | push_block(s, BLOCK_QUOTE); 582 | return true; 583 | } 584 | return false; 585 | } 586 | 587 | static bool parse_atx_heading(Scanner *s, TSLexer *lexer, 588 | const bool *valid_symbols) { 589 | if (valid_symbols[ATX_H1_MARKER] && s->indentation <= 3) { 590 | mark_end(s, lexer); 591 | uint16_t level = 0; 592 | while (lexer->lookahead == '#' && level <= 6) { 593 | advance(s, lexer); 594 | level++; 595 | } 596 | if (level <= 6 && 597 | (lexer->lookahead == ' ' || lexer->lookahead == '\t' || 598 | lexer->lookahead == '\n' || lexer->lookahead == '\r')) { 599 | lexer->result_symbol = ATX_H1_MARKER + (level - 1); 600 | s->indentation = 0; 601 | mark_end(s, lexer); 602 | return true; 603 | } 604 | } 605 | return false; 606 | } 607 
| 608 | static bool parse_setext_underline(Scanner *s, TSLexer *lexer, 609 | const bool *valid_symbols) { 610 | if (valid_symbols[SETEXT_H1_UNDERLINE] && 611 | s->matched == s->open_blocks.size) { 612 | mark_end(s, lexer); 613 | while (lexer->lookahead == '=') { 614 | advance(s, lexer); 615 | } 616 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 617 | advance(s, lexer); 618 | } 619 | if (lexer->lookahead == '\n' || lexer->lookahead == '\r') { 620 | lexer->result_symbol = SETEXT_H1_UNDERLINE; 621 | mark_end(s, lexer); 622 | return true; 623 | } 624 | } 625 | return false; 626 | } 627 | 628 | static bool parse_plus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { 629 | if (s->indentation <= 3 && 630 | (valid_symbols[LIST_MARKER_PLUS] || 631 | valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] || 632 | valid_symbols[PLUS_METADATA])) { 633 | advance(s, lexer); 634 | if (valid_symbols[PLUS_METADATA] && lexer->lookahead == '+') { 635 | advance(s, lexer); 636 | if (lexer->lookahead != '+') { 637 | return false; 638 | } 639 | advance(s, lexer); 640 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 641 | advance(s, lexer); 642 | } 643 | if (lexer->lookahead != '\n' && lexer->lookahead != '\r') { 644 | return false; 645 | } 646 | for (;;) { 647 | // advance over newline 648 | if (lexer->lookahead == '\r') { 649 | advance(s, lexer); 650 | if (lexer->lookahead == '\n') { 651 | advance(s, lexer); 652 | } 653 | } else { 654 | advance(s, lexer); 655 | } 656 | // check for pluses 657 | size_t plus_count = 0; 658 | while (lexer->lookahead == '+') { 659 | plus_count++; 660 | advance(s, lexer); 661 | } 662 | if (plus_count == 3) { 663 | // if exactly 3 check if next symbol (after eventual 664 | // whitespace) is newline 665 | while (lexer->lookahead == ' ' || 666 | lexer->lookahead == '\t') { 667 | advance(s, lexer); 668 | } 669 | if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { 670 | // if so also consume newline 671 | if 
(lexer->lookahead == '\r') { 672 | advance(s, lexer); 673 | if (lexer->lookahead == '\n') { 674 | advance(s, lexer); 675 | } 676 | } else { 677 | advance(s, lexer); 678 | } 679 | mark_end(s, lexer); 680 | lexer->result_symbol = PLUS_METADATA; 681 | return true; 682 | } 683 | } 684 | // otherwise consume rest of line 685 | while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && 686 | !lexer->eof(lexer)) { 687 | advance(s, lexer); 688 | } 689 | // if end of file is reached, then this is not metadata 690 | if (lexer->eof(lexer)) { 691 | break; 692 | } 693 | } 694 | } else { 695 | uint8_t extra_indentation = 0; 696 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 697 | extra_indentation += advance(s, lexer); 698 | } 699 | bool dont_interrupt = false; 700 | if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { 701 | extra_indentation = 1; 702 | dont_interrupt = true; 703 | } 704 | dont_interrupt = 705 | dont_interrupt && s->matched == s->open_blocks.size; 706 | if (extra_indentation >= 1 && 707 | (dont_interrupt ? valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] 708 | : valid_symbols[LIST_MARKER_PLUS])) { 709 | lexer->result_symbol = dont_interrupt 710 | ? 
LIST_MARKER_PLUS_DONT_INTERRUPT 711 | : LIST_MARKER_PLUS; 712 | extra_indentation--; 713 | if (extra_indentation <= 3) { 714 | extra_indentation += s->indentation; 715 | s->indentation = 0; 716 | } else { 717 | uint8_t temp = s->indentation; 718 | s->indentation = extra_indentation; 719 | extra_indentation = temp; 720 | } 721 | if (!s->simulate) 722 | push_block(s, (Block)(LIST_ITEM + extra_indentation)); 723 | return true; 724 | } 725 | } 726 | } 727 | return false; 728 | } 729 | 730 | static bool parse_ordered_list_marker(Scanner *s, TSLexer *lexer, 731 | const bool *valid_symbols) { 732 | if (s->indentation <= 3 && 733 | (valid_symbols[LIST_MARKER_PARENTHESIS] || 734 | valid_symbols[LIST_MARKER_DOT] || 735 | valid_symbols[LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] || 736 | valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT])) { 737 | size_t digits = 1; 738 | bool dont_interrupt = lexer->lookahead != '1'; 739 | advance(s, lexer); 740 | while (isdigit(lexer->lookahead)) { 741 | dont_interrupt = true; 742 | digits++; 743 | advance(s, lexer); 744 | } 745 | if (digits >= 1 && digits <= 9) { 746 | bool dot = false; 747 | bool parenthesis = false; 748 | if (lexer->lookahead == '.') { 749 | advance(s, lexer); 750 | dot = true; 751 | } else if (lexer->lookahead == ')') { 752 | advance(s, lexer); 753 | parenthesis = true; 754 | } 755 | if (dot || parenthesis) { 756 | uint8_t extra_indentation = 0; 757 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 758 | extra_indentation += advance(s, lexer); 759 | } 760 | bool line_end = 761 | lexer->lookahead == '\n' || lexer->lookahead == '\r'; 762 | if (line_end) { 763 | extra_indentation = 1; 764 | dont_interrupt = true; 765 | } 766 | dont_interrupt = 767 | dont_interrupt && s->matched == s->open_blocks.size; 768 | if (extra_indentation >= 1 && 769 | (dot ? (dont_interrupt 770 | ? valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT] 771 | : valid_symbols[LIST_MARKER_DOT]) 772 | : (dont_interrupt 773 | ? 
valid_symbols 774 | [LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] 775 | : valid_symbols[LIST_MARKER_PARENTHESIS]))) { 776 | lexer->result_symbol = 777 | dot ? LIST_MARKER_DOT : LIST_MARKER_PARENTHESIS; 778 | extra_indentation--; 779 | if (extra_indentation <= 3) { 780 | extra_indentation += s->indentation; 781 | s->indentation = 0; 782 | } else { 783 | uint8_t temp = s->indentation; 784 | s->indentation = extra_indentation; 785 | extra_indentation = temp; 786 | } 787 | if (!s->simulate) 788 | push_block( 789 | s, (Block)(LIST_ITEM + extra_indentation + digits)); 790 | return true; 791 | } 792 | } 793 | } 794 | } 795 | return false; 796 | } 797 | 798 | static bool parse_minus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { 799 | if (s->indentation <= 3 && 800 | (valid_symbols[LIST_MARKER_MINUS] || 801 | valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] || 802 | valid_symbols[SETEXT_H2_UNDERLINE] || valid_symbols[THEMATIC_BREAK] || 803 | valid_symbols[MINUS_METADATA])) { 804 | mark_end(s, lexer); 805 | bool whitespace_after_minus = false; 806 | bool minus_after_whitespace = false; 807 | size_t minus_count = 0; 808 | uint8_t extra_indentation = 0; 809 | 810 | for (;;) { 811 | if (lexer->lookahead == '-') { 812 | if (minus_count == 1 && extra_indentation >= 1) { 813 | mark_end(s, lexer); 814 | } 815 | minus_count++; 816 | advance(s, lexer); 817 | minus_after_whitespace = whitespace_after_minus; 818 | } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 819 | if (minus_count == 1) { 820 | extra_indentation += advance(s, lexer); 821 | } else { 822 | advance(s, lexer); 823 | } 824 | whitespace_after_minus = true; 825 | } else { 826 | break; 827 | } 828 | } 829 | bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; 830 | bool dont_interrupt = false; 831 | if (minus_count == 1 && line_end) { 832 | extra_indentation = 1; 833 | dont_interrupt = true; 834 | } 835 | dont_interrupt = dont_interrupt && s->matched == s->open_blocks.size; 836 | bool 
thematic_break = minus_count >= 3 && line_end; 837 | bool underline = 838 | minus_count >= 1 && !minus_after_whitespace && line_end && 839 | s->matched == 840 | s->open_blocks 841 | .size; // setext heading can not break lazy continuation 842 | bool list_marker_minus = minus_count >= 1 && extra_indentation >= 1; 843 | bool success = false; 844 | if (valid_symbols[SETEXT_H2_UNDERLINE] && underline) { 845 | lexer->result_symbol = SETEXT_H2_UNDERLINE; 846 | mark_end(s, lexer); 847 | s->indentation = 0; 848 | success = true; 849 | } else if (valid_symbols[THEMATIC_BREAK] && 850 | thematic_break) { // underline is false if list_marker_minus 851 | // is true 852 | lexer->result_symbol = THEMATIC_BREAK; 853 | mark_end(s, lexer); 854 | s->indentation = 0; 855 | success = true; 856 | } else if ((dont_interrupt 857 | ? valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] 858 | : valid_symbols[LIST_MARKER_MINUS]) && 859 | list_marker_minus) { 860 | if (minus_count == 1) { 861 | mark_end(s, lexer); 862 | } 863 | extra_indentation--; 864 | if (extra_indentation <= 3) { 865 | extra_indentation += s->indentation; 866 | s->indentation = 0; 867 | } else { 868 | uint8_t temp = s->indentation; 869 | s->indentation = extra_indentation; 870 | extra_indentation = temp; 871 | } 872 | if (!s->simulate) 873 | push_block(s, (Block)(LIST_ITEM + extra_indentation)); 874 | lexer->result_symbol = dont_interrupt 875 | ? 
LIST_MARKER_MINUS_DONT_INTERRUPT 876 | : LIST_MARKER_MINUS; 877 | return true; 878 | } 879 | if (minus_count == 3 && (!minus_after_whitespace) && line_end && 880 | valid_symbols[MINUS_METADATA]) { 881 | for (;;) { 882 | // advance over newline 883 | if (lexer->lookahead == '\r') { 884 | advance(s, lexer); 885 | if (lexer->lookahead == '\n') { 886 | advance(s, lexer); 887 | } 888 | } else { 889 | advance(s, lexer); 890 | } 891 | // check for minuses 892 | minus_count = 0; 893 | while (lexer->lookahead == '-') { 894 | minus_count++; 895 | advance(s, lexer); 896 | } 897 | if (minus_count == 3) { 898 | // if exactly 3 check if next symbol (after eventual 899 | // whitespace) is newline 900 | while (lexer->lookahead == ' ' || 901 | lexer->lookahead == '\t') { 902 | advance(s, lexer); 903 | } 904 | if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { 905 | // if so also consume newline 906 | if (lexer->lookahead == '\r') { 907 | advance(s, lexer); 908 | if (lexer->lookahead == '\n') { 909 | advance(s, lexer); 910 | } 911 | } else { 912 | advance(s, lexer); 913 | } 914 | mark_end(s, lexer); 915 | lexer->result_symbol = MINUS_METADATA; 916 | return true; 917 | } 918 | } 919 | // otherwise consume rest of line 920 | while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && 921 | !lexer->eof(lexer)) { 922 | advance(s, lexer); 923 | } 924 | // if end of file is reached, then this is not metadata 925 | if (lexer->eof(lexer)) { 926 | break; 927 | } 928 | } 929 | } 930 | if (success) { 931 | return true; 932 | } 933 | } 934 | return false; 935 | } 936 | 937 | static bool parse_html_block(Scanner *s, TSLexer *lexer, 938 | const bool *valid_symbols) { 939 | if (!(valid_symbols[HTML_BLOCK_1_START] || 940 | valid_symbols[HTML_BLOCK_1_END] || 941 | valid_symbols[HTML_BLOCK_2_START] || 942 | valid_symbols[HTML_BLOCK_3_START] || 943 | valid_symbols[HTML_BLOCK_4_START] || 944 | valid_symbols[HTML_BLOCK_5_START] || 945 | valid_symbols[HTML_BLOCK_6_START] || 946 | 
valid_symbols[HTML_BLOCK_7_START])) { 947 | return false; 948 | } 949 | advance(s, lexer); 950 | if (lexer->lookahead == '?' && valid_symbols[HTML_BLOCK_3_START]) { 951 | advance(s, lexer); 952 | lexer->result_symbol = HTML_BLOCK_3_START; 953 | if (!s->simulate) 954 | push_block(s, ANONYMOUS); 955 | return true; 956 | } 957 | if (lexer->lookahead == '!') { 958 | // could be block 2 959 | advance(s, lexer); 960 | if (lexer->lookahead == '-') { 961 | advance(s, lexer); 962 | if (lexer->lookahead == '-' && valid_symbols[HTML_BLOCK_2_START]) { 963 | advance(s, lexer); 964 | lexer->result_symbol = HTML_BLOCK_2_START; 965 | if (!s->simulate) 966 | push_block(s, ANONYMOUS); 967 | return true; 968 | } 969 | } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z' && 970 | valid_symbols[HTML_BLOCK_4_START]) { 971 | advance(s, lexer); 972 | lexer->result_symbol = HTML_BLOCK_4_START; 973 | if (!s->simulate) 974 | push_block(s, ANONYMOUS); 975 | return true; 976 | } else if (lexer->lookahead == '[') { 977 | advance(s, lexer); 978 | if (lexer->lookahead == 'C') { 979 | advance(s, lexer); 980 | if (lexer->lookahead == 'D') { 981 | advance(s, lexer); 982 | if (lexer->lookahead == 'A') { 983 | advance(s, lexer); 984 | if (lexer->lookahead == 'T') { 985 | advance(s, lexer); 986 | if (lexer->lookahead == 'A') { 987 | advance(s, lexer); 988 | if (lexer->lookahead == '[' && 989 | valid_symbols[HTML_BLOCK_5_START]) { 990 | advance(s, lexer); 991 | lexer->result_symbol = HTML_BLOCK_5_START; 992 | if (!s->simulate) 993 | push_block(s, ANONYMOUS); 994 | return true; 995 | } 996 | } 997 | } 998 | } 999 | } 1000 | } 1001 | } 1002 | } 1003 | bool starting_slash = lexer->lookahead == '/'; 1004 | if (starting_slash) { 1005 | advance(s, lexer); 1006 | } 1007 | char name[11]; 1008 | size_t name_length = 0; 1009 | while (iswalpha((wint_t)lexer->lookahead)) { 1010 | if (name_length < 10) { 1011 | name[name_length++] = (char)towlower((wint_t)lexer->lookahead); 1012 | } else { 1013 | 
name_length = 12; 1014 | } 1015 | advance(s, lexer); 1016 | } 1017 | if (name_length == 0) { 1018 | return false; 1019 | } 1020 | bool tag_closed = false; 1021 | if (name_length < 11) { 1022 | name[name_length] = 0; 1023 | bool next_symbol_valid = 1024 | lexer->lookahead == ' ' || lexer->lookahead == '\t' || 1025 | lexer->lookahead == '\n' || lexer->lookahead == '\r' || 1026 | lexer->lookahead == '>'; 1027 | if (next_symbol_valid) { 1028 | // try block 1 names 1029 | for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_1; i++) { 1030 | if (strcmp(name, HTML_TAG_NAMES_RULE_1[i]) == 0) { 1031 | if (starting_slash) { 1032 | if (valid_symbols[HTML_BLOCK_1_END]) { 1033 | lexer->result_symbol = HTML_BLOCK_1_END; 1034 | return true; 1035 | } 1036 | } else if (valid_symbols[HTML_BLOCK_1_START]) { 1037 | lexer->result_symbol = HTML_BLOCK_1_START; 1038 | if (!s->simulate) 1039 | push_block(s, ANONYMOUS); 1040 | return true; 1041 | } 1042 | } 1043 | } 1044 | } 1045 | if (!next_symbol_valid && lexer->lookahead == '/') { 1046 | advance(s, lexer); 1047 | if (lexer->lookahead == '>') { 1048 | advance(s, lexer); 1049 | tag_closed = true; 1050 | } 1051 | } 1052 | if (next_symbol_valid || tag_closed) { 1053 | // try block 2 names 1054 | for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_7; i++) { 1055 | if (strcmp(name, HTML_TAG_NAMES_RULE_7[i]) == 0 && 1056 | valid_symbols[HTML_BLOCK_6_START]) { 1057 | lexer->result_symbol = HTML_BLOCK_6_START; 1058 | if (!s->simulate) 1059 | push_block(s, ANONYMOUS); 1060 | return true; 1061 | } 1062 | } 1063 | } 1064 | } 1065 | 1066 | if (!valid_symbols[HTML_BLOCK_7_START]) { 1067 | return false; 1068 | } 1069 | 1070 | if (!tag_closed) { 1071 | // tag name (continued) 1072 | while (iswalnum((wint_t)lexer->lookahead) || lexer->lookahead == '-') { 1073 | advance(s, lexer); 1074 | } 1075 | if (!starting_slash) { 1076 | // attributes 1077 | bool had_whitespace = false; 1078 | for (;;) { 1079 | // whitespace 1080 | while (lexer->lookahead == ' ' || 
lexer->lookahead == '\t') { 1081 | had_whitespace = true; 1082 | advance(s, lexer); 1083 | } 1084 | if (lexer->lookahead == '/') { 1085 | advance(s, lexer); 1086 | break; 1087 | } 1088 | if (lexer->lookahead == '>') { 1089 | break; 1090 | } 1091 | // attribute name 1092 | if (!had_whitespace) { 1093 | return false; 1094 | } 1095 | if (!iswalpha((wint_t)lexer->lookahead) && 1096 | lexer->lookahead != '_' && lexer->lookahead != ':') { 1097 | return false; 1098 | } 1099 | had_whitespace = false; 1100 | advance(s, lexer); 1101 | while (iswalnum((wint_t)lexer->lookahead) || 1102 | lexer->lookahead == '_' || lexer->lookahead == '.' || 1103 | lexer->lookahead == ':' || lexer->lookahead == '-') { 1104 | advance(s, lexer); 1105 | } 1106 | // attribute value specification 1107 | // optional whitespace 1108 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1109 | had_whitespace = true; 1110 | advance(s, lexer); 1111 | } 1112 | // = 1113 | if (lexer->lookahead == '=') { 1114 | advance(s, lexer); 1115 | had_whitespace = false; 1116 | // optional whitespace 1117 | while (lexer->lookahead == ' ' || 1118 | lexer->lookahead == '\t') { 1119 | advance(s, lexer); 1120 | } 1121 | // attribute value 1122 | if (lexer->lookahead == '\'' || lexer->lookahead == '"') { 1123 | char delimiter = (char)lexer->lookahead; 1124 | advance(s, lexer); 1125 | while (lexer->lookahead != delimiter && 1126 | lexer->lookahead != '\n' && 1127 | lexer->lookahead != '\r' && !lexer->eof(lexer)) { 1128 | advance(s, lexer); 1129 | } 1130 | if (lexer->lookahead != delimiter) { 1131 | return false; 1132 | } 1133 | advance(s, lexer); 1134 | } else { 1135 | // unquoted attribute value 1136 | bool had_one = false; 1137 | while (lexer->lookahead != ' ' && 1138 | lexer->lookahead != '\t' && 1139 | lexer->lookahead != '"' && 1140 | lexer->lookahead != '\'' && 1141 | lexer->lookahead != '=' && 1142 | lexer->lookahead != '<' && 1143 | lexer->lookahead != '>' && 1144 | lexer->lookahead != '`' && 1145 | 
lexer->lookahead != '\n' && 1146 | lexer->lookahead != '\r' && !lexer->eof(lexer)) { 1147 | advance(s, lexer); 1148 | had_one = true; 1149 | } 1150 | if (!had_one) { 1151 | return false; 1152 | } 1153 | } 1154 | } 1155 | } 1156 | } else { 1157 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1158 | advance(s, lexer); 1159 | } 1160 | } 1161 | if (lexer->lookahead != '>') { 1162 | return false; 1163 | } 1164 | advance(s, lexer); 1165 | } 1166 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1167 | advance(s, lexer); 1168 | } 1169 | if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { 1170 | lexer->result_symbol = HTML_BLOCK_7_START; 1171 | if (!s->simulate) 1172 | push_block(s, ANONYMOUS); 1173 | return true; 1174 | } 1175 | return false; 1176 | } 1177 | 1178 | static bool parse_pipe_table(Scanner *s, TSLexer *lexer, 1179 | const bool *valid_symbols) { 1180 | 1181 | // unused 1182 | (void)(valid_symbols); 1183 | 1184 | // PIPE_TABLE_START is zero width 1185 | mark_end(s, lexer); 1186 | // count number of cells 1187 | size_t cell_count = 0; 1188 | // also remember if we see starting and ending pipes, as empty headers have 1189 | // to have both 1190 | bool starting_pipe = false; 1191 | bool ending_pipe = false; 1192 | bool empty = true; 1193 | if (lexer->lookahead == '|') { 1194 | starting_pipe = true; 1195 | advance(s, lexer); 1196 | } 1197 | while (lexer->lookahead != '\r' && lexer->lookahead != '\n' && 1198 | !lexer->eof(lexer)) { 1199 | if (lexer->lookahead == '|') { 1200 | cell_count++; 1201 | ending_pipe = true; 1202 | advance(s, lexer); 1203 | } else { 1204 | if (lexer->lookahead != ' ' && lexer->lookahead != '\t') { 1205 | ending_pipe = false; 1206 | } 1207 | if (lexer->lookahead == '\\') { 1208 | advance(s, lexer); 1209 | if (is_punctuation((char)lexer->lookahead)) { 1210 | advance(s, lexer); 1211 | } 1212 | } else { 1213 | advance(s, lexer); 1214 | } 1215 | } 1216 | } 1217 | if (empty && cell_count == 0 && !(starting_pipe 
&& ending_pipe)) { 1218 | return false; 1219 | } 1220 | if (!ending_pipe) { 1221 | cell_count++; 1222 | } 1223 | 1224 | // check the following line for a delimiter row 1225 | // parse a newline 1226 | if (lexer->lookahead == '\n') { 1227 | advance(s, lexer); 1228 | } else if (lexer->lookahead == '\r') { 1229 | advance(s, lexer); 1230 | if (lexer->lookahead == '\n') { 1231 | advance(s, lexer); 1232 | } 1233 | } else { 1234 | return false; 1235 | } 1236 | s->indentation = 0; 1237 | s->column = 0; 1238 | for (;;) { 1239 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1240 | s->indentation += advance(s, lexer); 1241 | } else { 1242 | break; 1243 | } 1244 | } 1245 | s->simulate = true; 1246 | uint8_t matched_temp = 0; 1247 | while (matched_temp < (uint8_t)s->open_blocks.size) { 1248 | if (match(s, lexer, s->open_blocks.items[matched_temp])) { 1249 | matched_temp++; 1250 | } else { 1251 | return false; 1252 | } 1253 | } 1254 | 1255 | // check if delimiter row has the same number of cells and at least one pipe 1256 | size_t delimiter_cell_count = 0; 1257 | if (lexer->lookahead == '|') { 1258 | advance(s, lexer); 1259 | } 1260 | for (;;) { 1261 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1262 | advance(s, lexer); 1263 | } 1264 | if (lexer->lookahead == '|') { 1265 | delimiter_cell_count++; 1266 | advance(s, lexer); 1267 | continue; 1268 | } 1269 | if (lexer->lookahead == ':') { 1270 | advance(s, lexer); 1271 | if (lexer->lookahead != '-') { 1272 | return false; 1273 | } 1274 | } 1275 | bool had_one_minus = false; 1276 | while (lexer->lookahead == '-') { 1277 | had_one_minus = true; 1278 | advance(s, lexer); 1279 | } 1280 | if (had_one_minus) { 1281 | delimiter_cell_count++; 1282 | } 1283 | if (lexer->lookahead == ':') { 1284 | if (!had_one_minus) { 1285 | return false; 1286 | } 1287 | advance(s, lexer); 1288 | } 1289 | while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1290 | advance(s, lexer); 1291 | } 1292 | if 
(lexer->lookahead == '|') { 1293 | if (!had_one_minus) { 1294 | delimiter_cell_count++; 1295 | } 1296 | advance(s, lexer); 1297 | continue; 1298 | } 1299 | if (lexer->lookahead != '\r' && lexer->lookahead != '\n') { 1300 | return false; 1301 | } else { 1302 | break; 1303 | } 1304 | } 1305 | // if the cell counts are not equal then this is not a table 1306 | if (cell_count != delimiter_cell_count) { 1307 | return false; 1308 | } 1309 | 1310 | lexer->result_symbol = PIPE_TABLE_START; 1311 | return true; 1312 | } 1313 | 1314 | static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { 1315 | // A normal tree-sitter rule decided that the current branch is invalid and 1316 | // now "requests" an error to stop the branch 1317 | if (valid_symbols[TRIGGER_ERROR]) { 1318 | return error(lexer); 1319 | } 1320 | 1321 | // Close the inner most block after the next line break as requested. See 1322 | // `$._close_block` in grammar.js 1323 | if (valid_symbols[CLOSE_BLOCK]) { 1324 | s->state |= STATE_CLOSE_BLOCK; 1325 | lexer->result_symbol = CLOSE_BLOCK; 1326 | return true; 1327 | } 1328 | 1329 | // if we are at the end of the file and there are still open blocks close 1330 | // them all 1331 | if (lexer->eof(lexer)) { 1332 | if (valid_symbols[TOKEN_EOF]) { 1333 | lexer->result_symbol = TOKEN_EOF; 1334 | return true; 1335 | } 1336 | if (s->open_blocks.size > 0) { 1337 | lexer->result_symbol = BLOCK_CLOSE; 1338 | if (!s->simulate) 1339 | pop_block(s); 1340 | return true; 1341 | } 1342 | return false; 1343 | } 1344 | 1345 | if (!(s->state & STATE_MATCHING)) { 1346 | // Parse any preceeding whitespace and remember its length. This makes a 1347 | // lot of parsing quite a bit easier. 1348 | for (;;) { 1349 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1350 | s->indentation += advance(s, lexer); 1351 | } else { 1352 | break; 1353 | } 1354 | } 1355 | // We are not matching. This is where the parsing logic for most 1356 | // "normal" token is. 
Most importantly parsing logic for the start of 1357 | // new blocks. 1358 | if (valid_symbols[INDENTED_CHUNK_START] && 1359 | !valid_symbols[NO_INDENTED_CHUNK]) { 1360 | if (s->indentation >= 4 && lexer->lookahead != '\n' && 1361 | lexer->lookahead != '\r') { 1362 | lexer->result_symbol = INDENTED_CHUNK_START; 1363 | if (!s->simulate) 1364 | push_block(s, INDENTED_CODE_BLOCK); 1365 | s->indentation -= 4; 1366 | return true; 1367 | } 1368 | } 1369 | // Decide which tokens to consider based on the first non-whitespace 1370 | // character 1371 | switch (lexer->lookahead) { 1372 | case '\r': 1373 | case '\n': 1374 | if (valid_symbols[BLANK_LINE_START]) { 1375 | // A blank line token is actually just 0 width, so do not 1376 | // consume the characters 1377 | lexer->result_symbol = BLANK_LINE_START; 1378 | return true; 1379 | } 1380 | break; 1381 | case '`': 1382 | // A backtick could mark the beginning or ending of a fenced 1383 | // code block. 1384 | return parse_fenced_code_block(s, '`', lexer, valid_symbols); 1385 | case '~': 1386 | // A tilde could mark the beginning or ending of a fenced code 1387 | // block. 1388 | return parse_fenced_code_block(s, '~', lexer, valid_symbols); 1389 | case '*': 1390 | // A star could either mark a list item or a thematic break. 1391 | // This code is similar to the code for '_' and '+'. 
1392 | return parse_star(s, lexer, valid_symbols); 1393 | case '_': 1394 | return parse_thematic_break_underscore(s, lexer, valid_symbols); 1395 | case '>': 1396 | // A '>' could mark the beginning of a block quote 1397 | return parse_block_quote(s, lexer, valid_symbols); 1398 | case '#': 1399 | // A '#' could mark a atx heading 1400 | return parse_atx_heading(s, lexer, valid_symbols); 1401 | case '=': 1402 | // A '=' could mark a setext underline 1403 | return parse_setext_underline(s, lexer, valid_symbols); 1404 | case '+': 1405 | // A '+' could be a list marker 1406 | return parse_plus(s, lexer, valid_symbols); 1407 | case '0': 1408 | case '1': 1409 | case '2': 1410 | case '3': 1411 | case '4': 1412 | case '5': 1413 | case '6': 1414 | case '7': 1415 | case '8': 1416 | case '9': 1417 | // A number could be a list marker (if followed by a dot or a 1418 | // parenthesis) 1419 | return parse_ordered_list_marker(s, lexer, valid_symbols); 1420 | case '-': 1421 | // A minus could mark a list marker, a thematic break or a 1422 | // setext underline 1423 | return parse_minus(s, lexer, valid_symbols); 1424 | case '<': 1425 | // A < could mark the beginning of a html block 1426 | return parse_html_block(s, lexer, valid_symbols); 1427 | } 1428 | if (lexer->lookahead != '\r' && lexer->lookahead != '\n' && 1429 | valid_symbols[PIPE_TABLE_START]) { 1430 | return parse_pipe_table(s, lexer, valid_symbols); 1431 | } 1432 | } else { // we are in the state of trying to match all currently open blocks 1433 | bool partial_success = false; 1434 | while (s->matched < (uint8_t)s->open_blocks.size) { 1435 | if (s->matched == (uint8_t)s->open_blocks.size - 1 && 1436 | (s->state & STATE_CLOSE_BLOCK)) { 1437 | if (!partial_success) 1438 | s->state &= ~STATE_CLOSE_BLOCK; 1439 | break; 1440 | } 1441 | if (match(s, lexer, s->open_blocks.items[s->matched])) { 1442 | partial_success = true; 1443 | s->matched++; 1444 | } else { 1445 | if (s->state & STATE_WAS_SOFT_LINE_BREAK) { 1446 | s->state &= 
(~STATE_MATCHING); 1447 | } 1448 | break; 1449 | } 1450 | } 1451 | if (partial_success) { 1452 | if (s->matched == s->open_blocks.size) { 1453 | s->state &= (~STATE_MATCHING); 1454 | } 1455 | lexer->result_symbol = BLOCK_CONTINUATION; 1456 | return true; 1457 | } 1458 | 1459 | if (!(s->state & STATE_WAS_SOFT_LINE_BREAK)) { 1460 | lexer->result_symbol = BLOCK_CLOSE; 1461 | pop_block(s); 1462 | if (s->matched == s->open_blocks.size) { 1463 | s->state &= (~STATE_MATCHING); 1464 | } 1465 | return true; 1466 | } 1467 | } 1468 | 1469 | // The parser just encountered a line break. Setup the state correspondingly 1470 | if ((valid_symbols[LINE_ENDING] || valid_symbols[SOFT_LINE_ENDING] || 1471 | valid_symbols[PIPE_TABLE_LINE_ENDING]) && 1472 | (lexer->lookahead == '\n' || lexer->lookahead == '\r')) { 1473 | if (lexer->lookahead == '\r') { 1474 | advance(s, lexer); 1475 | if (lexer->lookahead == '\n') { 1476 | advance(s, lexer); 1477 | } 1478 | } else { 1479 | advance(s, lexer); 1480 | } 1481 | s->indentation = 0; 1482 | s->column = 0; 1483 | if (!(s->state & STATE_CLOSE_BLOCK) && 1484 | (valid_symbols[SOFT_LINE_ENDING] || 1485 | valid_symbols[PIPE_TABLE_LINE_ENDING])) { 1486 | lexer->mark_end(lexer); 1487 | for (;;) { 1488 | if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { 1489 | s->indentation += advance(s, lexer); 1490 | } else { 1491 | break; 1492 | } 1493 | } 1494 | s->simulate = true; 1495 | uint8_t matched_temp = s->matched; 1496 | s->matched = 0; 1497 | bool one_will_be_matched = false; 1498 | while (s->matched < (uint8_t)s->open_blocks.size) { 1499 | if (match(s, lexer, s->open_blocks.items[s->matched])) { 1500 | s->matched++; 1501 | one_will_be_matched = true; 1502 | } else { 1503 | break; 1504 | } 1505 | } 1506 | bool all_will_be_matched = s->matched == s->open_blocks.size; 1507 | if (!lexer->eof(lexer) && 1508 | !scan(s, lexer, paragraph_interrupt_symbols)) { 1509 | s->matched = matched_temp; 1510 | // If the last line break ended a paragraph and no 
new block 1511 | // opened, the last line break should have been a soft line 1512 | // break Reset the counter for matched blocks 1513 | s->matched = 0; 1514 | s->indentation = 0; 1515 | s->column = 0; 1516 | // If there is at least one open block, we should be in the 1517 | // matching state. Also set the matching flag if a 1518 | // `$._soft_line_break_marker` can be emitted so it does get 1519 | // emitted. 1520 | if (one_will_be_matched) { 1521 | s->state |= STATE_MATCHING; 1522 | } else { 1523 | s->state &= (~STATE_MATCHING); 1524 | } 1525 | if (valid_symbols[PIPE_TABLE_LINE_ENDING]) { 1526 | if (all_will_be_matched) { 1527 | lexer->result_symbol = PIPE_TABLE_LINE_ENDING; 1528 | return true; 1529 | } 1530 | } else { 1531 | lexer->result_symbol = SOFT_LINE_ENDING; 1532 | // reset some state variables 1533 | s->state |= STATE_WAS_SOFT_LINE_BREAK; 1534 | return true; 1535 | } 1536 | } else { 1537 | s->matched = matched_temp; 1538 | } 1539 | s->indentation = 0; 1540 | s->column = 0; 1541 | } 1542 | if (valid_symbols[LINE_ENDING]) { 1543 | // If the last line break ended a paragraph and no new block opened, 1544 | // the last line break should have been a soft line break Reset the 1545 | // counter for matched blocks 1546 | s->matched = 0; 1547 | // If there is at least one open block, we should be in the matching 1548 | // state. Also set the matching flag if a 1549 | // `$._soft_line_break_marker` can be emitted so it does get 1550 | // emitted. 
1551 | if (s->open_blocks.size > 0) { 1552 | s->state |= STATE_MATCHING; 1553 | } else { 1554 | s->state &= (~STATE_MATCHING); 1555 | } 1556 | // reset some state variables 1557 | s->state &= (~STATE_WAS_SOFT_LINE_BREAK); 1558 | lexer->result_symbol = LINE_ENDING; 1559 | return true; 1560 | } 1561 | } 1562 | return false; 1563 | } 1564 | 1565 | void *tree_sitter_markdown_external_scanner_create(void) { 1566 | Scanner *s = (Scanner *)malloc(sizeof(Scanner)); 1567 | s->open_blocks.items = (Block *)calloc(1, sizeof(Block)); 1568 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) 1569 | _Static_assert(ATX_H6_MARKER == ATX_H1_MARKER + 5, ""); 1570 | #else 1571 | assert(ATX_H6_MARKER == ATX_H1_MARKER + 5); 1572 | #endif 1573 | deserialize(s, NULL, 0); 1574 | 1575 | return s; 1576 | } 1577 | 1578 | bool tree_sitter_markdown_external_scanner_scan(void *payload, TSLexer *lexer, 1579 | const bool *valid_symbols) { 1580 | Scanner *scanner = (Scanner *)payload; 1581 | scanner->simulate = false; 1582 | return scan(scanner, lexer, valid_symbols); 1583 | } 1584 | 1585 | unsigned tree_sitter_markdown_external_scanner_serialize(void *payload, 1586 | char *buffer) { 1587 | Scanner *scanner = (Scanner *)payload; 1588 | return serialize(scanner, buffer); 1589 | } 1590 | 1591 | void tree_sitter_markdown_external_scanner_deserialize(void *payload, 1592 | const char *buffer, 1593 | unsigned length) { 1594 | Scanner *scanner = (Scanner *)payload; 1595 | deserialize(scanner, buffer, length); 1596 | } 1597 | 1598 | void tree_sitter_markdown_external_scanner_destroy(void *payload) { 1599 | Scanner *scanner = (Scanner *)payload; 1600 | free(scanner->open_blocks.items); 1601 | free(scanner); 1602 | } 1603 | ```