nixd
Loading...
Searching...
No Matches
SourceCode.cpp
Go to the documentation of this file.
2#include "lspserver/Logger.h"
3#include <llvm/ADT/StringExtras.h>
4#include <llvm/Support/Errc.h>
5
6namespace lspserver {
7
8/// TODO: support more encodings (from clangd, using Context)
9static OffsetEncoding lspEncoding() { return OffsetEncoding::UTF16; }
10
11template <typename Callback>
12static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
13 bool LoggedInvalid = false;
14 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
15 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
16 for (size_t I = 0; I < U8.size();) {
17 unsigned char C = static_cast<unsigned char>(U8[I]);
18 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
19 if (CB(1, 1))
20 return true;
21 ++I;
22 continue;
23 }
24 // This convenient property of UTF-8 holds for all non-ASCII characters.
25 size_t UTF8Length = llvm::countl_one(C);
26 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here.
27 // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*.
28 if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) {
29 if (!LoggedInvalid) {
30 elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8));
31 LoggedInvalid = true;
32 }
33 // We can't give a correct result, but avoid returning something wild.
34 // Pretend this is a valid ASCII byte, for lack of better options.
35 // (Too late to get ISO-8859-* right, we've skipped some bytes already).
36 if (CB(1, 1))
37 return true;
38 ++I;
39 continue;
40 }
41 I += UTF8Length; // Skip over all trailing bytes.
42 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
43 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...)
44 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1))
45 return true;
46 }
47 return false;
48}
49
50// Like most strings in clangd, the input is UTF-8 encoded.
// Returns the length of \p Code measured in units of the encoding reported by
// lspEncoding().
//
// NOTE(review): the `case` labels of the switch below appear to be missing
// from this listing (the embedded line numbers skip 54, 57, 63 and 69).
// Presumably each branch is selected by one OffsetEncoding enumerator —
// confirm against the original file before editing the code.
51size_t lspLength(llvm::StringRef Code) {
52 size_t Count = 0;
53 switch (lspEncoding()) {
// Branch 1: the length is simply the number of bytes in Code.
55 Count = Code.size();
56 break;
// Branch 2: sum the UTF-16 code units of every codepoint. The callback always
// returns false so the whole string is visited.
58 iterateCodepoints(Code, [&](int U8Len, int U16Len) {
59 Count += U16Len;
60 return false;
61 });
62 break;
// Branch 3: count one unit per callback invocation, i.e. per codepoint.
64 iterateCodepoints(Code, [&](int U8Len, int U16Len) {
65 ++Count;
66 return false;
67 });
68 break;
// Branch 4: any remaining encoding is treated as a programming error.
70 llvm_unreachable("unsupported encoding");
71 }
72 return Count;
73}
74
75// Returns the byte offset into the string that is an offset of \p Units in
76// the specified encoding.
77// Conceptually, this converts to the encoding, truncates to CodeUnits,
78// converts back to UTF-8, and returns the length in bytes.
// \p U8    UTF-8 text to measure (callers in this file pass a single line).
// \p Units offset expressed in code units of \p Enc.
// \p Valid out-parameter; set to false when Units is negative, when the
//          string ends before Units code units were consumed, when the offset
//          falls inside a surrogate pair, or when the byte result overran U8.
//
// NOTE(review): the `case` labels of the switch below appear to be missing
// from this listing (the embedded line numbers skip 86, 89, 98 and 105).
// Presumably each branch is selected by one OffsetEncoding enumerator —
// confirm against the original file before editing the code.
79static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
80 bool &Valid) {
// A negative offset is invalid; zero and negative offsets both map to byte 0.
81 Valid = Units >= 0;
82 if (Units <= 0)
83 return 0;
84 size_t Result = 0;
85 switch (Enc) {
// Branch 1: units are taken to be bytes already.
87 Result = Units;
88 break;
// Branch 2: consume codepoints, accumulating their byte length and subtracting
// their UTF-16 length from Units, until Units is used up. iterateCodepoints
// returns true only if the callback stopped it, i.e. Units reached <= 0 before
// the end of U8.
90 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
91 Result += U8Len;
92 Units -= U16Len;
93 return Units <= 0;
94 });
95 if (Units < 0) // Offset in the middle of a surrogate pair.
96 Valid = false;
97 break;
// Branch 3: same walk, but every codepoint counts as exactly one unit.
99 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
100 Result += U8Len;
101 Units--;
102 return Units <= 0;
103 });
104 break;
// Branch 4: any remaining encoding is treated as a programming error.
106 llvm_unreachable("unsupported encoding");
107 }
108 // Don't return an out-of-range index if we overran.
109 if (Result > U8.size()) {
110 Valid = false;
111 return U8.size();
112 }
113 return Result;
114}
115
116llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
117 bool AllowColumnsBeyondLineLength) {
118 if (P.line < 0)
119 return error(llvm::errc::invalid_argument,
120 "Line value can't be negative ({0})", P.line);
121 if (P.character < 0)
122 return error(llvm::errc::invalid_argument,
123 "Character value can't be negative ({0})", P.character);
124 size_t StartOfLine = 0;
125 for (int I = 0; I != P.line; ++I) {
126 size_t NextNL = Code.find('\n', StartOfLine);
127 if (NextNL == llvm::StringRef::npos)
128 return error(llvm::errc::invalid_argument,
129 "Line value is out of range ({0})", P.line);
130 StartOfLine = NextNL + 1;
131 }
132 llvm::StringRef Line =
133 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
134
135 // P.character may be in UTF-16, transcode if necessary.
136 bool Valid;
137 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
138 if (!Valid && !AllowColumnsBeyondLineLength)
139 return error(llvm::errc::invalid_argument,
140 "{0} offset {1} is invalid for line {2}", lspEncoding(),
141 P.character, P.line);
142 return StartOfLine + ByteInLine;
143}
144
145Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
146 Offset = std::min(Code.size(), Offset);
147 llvm::StringRef Before = Code.substr(0, Offset);
148 int Lines = Before.count('\n');
149 size_t PrevNL = Before.rfind('\n');
150 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1);
151 Position Pos;
152 Pos.line = Lines;
153 Pos.character = lspLength(Before.substr(StartOfLine));
154 return Pos;
155}
156
157// Workaround for editors that have buggy handling of newlines at end of file.
158//
159// The editor is supposed to expose document contents over LSP as an exact
160// string, with whitespace and newlines well-defined. But internally many
161// editors treat text as an array of lines, and there can be ambiguity over
162// whether the last line ends with a newline or not.
163//
164// This confusion can lead to incorrect edits being sent. Failing to apply them
165// is catastrophic: we're desynced, LSP has no mechanism to get back in sync.
166// We apply a heuristic to avoid this state.
167//
168// If our current view of an N-line file does *not* end in a newline, but the
169// editor refers to the start of the next line (an impossible location), then
170// we silently add a newline to make this valid.
171// We will still validate that the rangeLength is correct, *including* the
172// inferred newline.
173//
174// See https://github.com/neovim/neovim/issues/17085
175static void inferFinalNewline(llvm::Expected<size_t> &Err,
176 std::string &Contents, const Position &Pos) {
177 if (Err)
178 return;
179 if (!Contents.empty() && Contents.back() == '\n')
180 return;
181 if (Pos.character != 0)
182 return;
183 if (Pos.line != llvm::count(Contents, '\n') + 1)
184 return;
185 log("Editor sent invalid change coordinates, inferring newline at EOF");
186 Contents.push_back('\n');
187 consumeError(Err.takeError());
188 Err = Contents.size();
189}
190
191llvm::Error applyChange(std::string &Contents,
192 const TextDocumentContentChangeEvent &Change) {
193 if (!Change.range) {
194 Contents = Change.text;
195 return llvm::Error::success();
196 }
197
198 const Position &Start = Change.range->start;
199 llvm::Expected<size_t> StartIndex = positionToOffset(Contents, Start, false);
200 inferFinalNewline(StartIndex, Contents, Start);
201 if (!StartIndex)
202 return StartIndex.takeError();
203
204 const Position &End = Change.range->end;
205 llvm::Expected<size_t> EndIndex = positionToOffset(Contents, End, false);
206 inferFinalNewline(EndIndex, Contents, End);
207 if (!EndIndex)
208 return EndIndex.takeError();
209
210 if (*EndIndex < *StartIndex)
211 return error(llvm::errc::invalid_argument,
212 "Range's end position ({0}) is before start position ({1})",
213 End, Start);
214
215 // Since the range length between two LSP positions is dependent on the
216 // contents of the buffer we compute the range length between the start and
217 // end position ourselves and compare it to the range length of the LSP
218 // message to verify the buffers of the client and server are in sync.
219
220 // EndIndex and StartIndex are in bytes, but Change.rangeLength is in UTF-16
221 // code units.
222 ssize_t ComputedRangeLength =
223 lspLength(Contents.substr(*StartIndex, *EndIndex - *StartIndex));
224
225 if (Change.rangeLength && ComputedRangeLength != *Change.rangeLength)
226 return error(llvm::errc::invalid_argument,
227 "Change's rangeLength ({0}) doesn't match the "
228 "computed range length ({1}).",
229 *Change.rangeLength, ComputedRangeLength);
230
231 Contents.replace(*StartIndex, *EndIndex - *StartIndex, Change.text);
232
233 return llvm::Error::success();
234}
235} // namespace lspserver
Whether current platform treats paths case insensitively.
Definition Connection.h:11
llvm::unique_function< void(llvm::Expected< T >)> Callback
Definition Function.h:14
llvm::Error error(std::error_code EC, const char *Fmt, Ts &&...Vals)
Definition Logger.h:70
llvm::Error applyChange(std::string &Contents, const TextDocumentContentChangeEvent &Change)
Apply an incremental update to a text document.
size_t lspLength(llvm::StringRef Code)
void elog(const char *Fmt, Ts &&...Vals)
Definition Logger.h:52
Position offsetToPosition(llvm::StringRef Code, size_t Offset)
llvm::Expected< size_t > positionToOffset(llvm::StringRef Code, Position P, bool AllowColumnsBeyondLineLength=true)
void log(const char *Fmt, Ts &&...Vals)
Definition Logger.h:58
int64_t line
Line position in a document (zero-based).
std::string text
The new text of the range/document.
std::optional< Range > range
The range of the document that changed.
std::optional< int > rangeLength
The length of the range that got replaced.