Skip to content

Commit

Permalink
Rewrite parser add lexer step
Browse files Browse the repository at this point in the history
Closes #69
  • Loading branch information
kthompson committed Feb 16, 2024
1 parent 353671b commit cf5b6a2
Show file tree
Hide file tree
Showing 9 changed files with 635 additions and 253 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ https://kthompson.github.io/glob/

### Common Expressions

| Pattern | Description |
|-----------|--------------------------------------------------------------------------------|
| taco* | matches any string beginning with taco |
| \*taco\* | matches any string containing taco |
| *taco | matches any string ending in taco |
| *.[ch] | matches any string ending in `.c` or `.h` |
| *.{gif,jpg} | match any string ending in `.gif` or `.jpg` |
| Pattern | Description |
|-------------|---------------------------------------------|
| taco* | matches any string beginning with taco |
| \*taco\* | matches any string containing taco |
| *taco | matches any string ending in taco |
| *.[ch] | matches any string ending in `.c` or `.h` |
| *.{gif,jpg} | matches any string ending in `.gif` or `.jpg` |

### Expressions

Expand Down
6 changes: 1 addition & 5 deletions src/Glob/Glob.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using GlobExpressions.AST;

namespace GlobExpressions
Expand Down Expand Up @@ -51,7 +47,7 @@ public bool IsMatch(string input)
if (_matchFilenameOnly && _segments!.Length == 1)
{
var last = pathSegments.LastOrDefault();
var tail = (last == null) ? new string[0] : new[] { last };
var tail = last == null ? Array.Empty<string>() : new[] { last };

if (GlobEvaluator.Eval(_segments, 0, tail, 0, _caseSensitive))
return true;
Expand Down
4 changes: 4 additions & 0 deletions src/Glob/Glob.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,8 @@
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>

<ItemGroup>
<Folder Include="Text" />
</ItemGroup>

</Project>
305 changes: 305 additions & 0 deletions src/Glob/Lexer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace GlobExpressions;

/// <summary>
/// Kinds of tokens that make up a lexed glob pattern.
/// </summary>
enum SyntaxKind
{
    // Returned by Lexer.ParseToken once the whole pattern has been consumed.
    EndOfInputToken,

    // NOTE(review): the two paren kinds are never emitted by the Lexer in this file;
    // presumably reserved for extended-glob support — confirm before relying on them.
    CloseParenToken,
    OpenParenToken,
    OpenBraceToken, // { — opens a literal set
    CloseBraceToken, // } — closes a literal set

    // The Lexer gives this token a span covering the set's contents only (no brackets,
    // no leading `!`) and a bool Value that is true when the set started with `!`.
    CharacterSet, // [...]

    QuestionToken, // ?
    StarToken, // *
    StarStarToken, // **
    SlashToken, // /
    CommaToken, // , — only lexed as its own token inside a literal set

    // A leading `/`, or a letter followed by `:`; only recognized at offset 0.
    RootToken,
    // A run of ordinary characters; Value holds the text with escape backslashes removed.
    LiteralToken,
}

/// <summary>
/// A single lexical unit of a glob pattern.
/// </summary>
/// <param name="Kind">The category of the token.</param>
/// <param name="Span">
/// Region of the pattern the token refers to. Not always the consumed text: the Lexer
/// gives a leading `/` root an empty span, and a character set a span over its contents only.
/// </param>
/// <param name="Value">
/// Payload whose runtime type depends on <paramref name="Kind"/>. As produced by the Lexer:
/// the unescaped text (a string) for literal tokens, a bool inversion flag for character
/// sets, and <see cref="string.Empty"/> for every other kind.
/// </param>
record Token(SyntaxKind Kind, TextSpan Span, object Value);

/// <summary>
/// Turns a glob pattern string into <see cref="Token"/>s, one per call to
/// <see cref="ParseToken"/>. Once the pattern is exhausted every further call
/// returns an <see cref="SyntaxKind.EndOfInputToken"/>.
/// </summary>
class Lexer
{
    private readonly string _source;

    // scratch buffer reused for every literal token
    private readonly StringBuilder _literalText = new();

    // index of the next unread character in _source
    private int _cursor;

    // offset of the `{` that opened the current literal set, or -1 when outside one
    private int _openBraceOffset = -1;

    /// <summary>Creates a lexer positioned at the start of <paramref name="pattern"/>.</summary>
    /// <param name="pattern">The glob pattern to tokenize.</param>
    public Lexer(string pattern)
    {
        _source = pattern;
    }

    /// <summary>
    /// Reads the next token and advances past it.
    /// </summary>
    /// <returns>The next token, or an end-of-input token when nothing is left.</returns>
    /// <exception cref="GlobPatternException">
    /// The pattern contains an extended glob, a misplaced `{`, `}` or `]`, an unterminated
    /// character set, or an unsupported escape sequence.
    /// </exception>
    public Token ParseToken()
    {
        if (Current is not char ch)
            return Emit(SyntaxKind.EndOfInputToken, 0);

        // a root can only appear at the very start of the pattern
        if (_cursor == 0 && LexRoot() is { } root)
            return root;

        RejectExtendedGlob(ch);

        if (IsIdentCharacter(ch, false))
            return LexLiteral(false);

        return ch switch
        {
            '{' => LexOpenBrace(),
            '}' => LexCloseBrace(),
            '[' => LexCharacterSet(),
            ']' => throw new GlobPatternException($"Invalid character set terminator at offset {_cursor}"),
            '?' => Emit(SyntaxKind.QuestionToken, 1),
            '*' => Lookahead == '*'
                ? Emit(SyntaxKind.StarStarToken, 2)
                : Emit(SyntaxKind.StarToken, 1),
            '/' => Emit(SyntaxKind.SlashToken, 1),
            ',' => Emit(SyntaxKind.CommaToken, 1),
            _ => throw new GlobPatternException($"Unexpected character {Current} at index {_cursor}"),
        };
    }

    private char? Current => Peek(_cursor);

    private char? Lookahead => Peek(_cursor + 1);

    private bool InLiteralSet => _openBraceOffset >= 0;

    // Character at the given index, or null when the index is past the end of the pattern.
    private char? Peek(int position) => position < _source.Length ? _source[position] : null;

    // Builds a value-less token of the given kind covering the next `length` characters
    // and moves the cursor past them.
    private Token Emit(SyntaxKind kind, int length)
    {
        var begin = _cursor;
        _cursor += length;
        return new Token(kind, TextSpan.FromBounds(begin, _cursor), string.Empty);
    }

    // Recognizes a root at the current position; returns null (consuming nothing) otherwise.
    private Token? LexRoot()
    {
        if (Current is not char first)
            return null;

        // osx/linux root: the `/` is consumed but the token carries an empty span
        if (first == '/')
        {
            _cursor += 1;
            return new Token(SyntaxKind.RootToken, TextSpan.FromBounds(0, 0), string.Empty);
        }

        // windows root: a letter followed by `:`
        if (char.IsLetter(first) && Lookahead == ':')
        {
            _cursor += 2;
            return new Token(SyntaxKind.RootToken, TextSpan.FromBounds(0, 2), string.Empty);
        }

        return null;
    }

    // stub support for extended globs if we ever want to support it:
    // a `(` immediately followed by one of ? * + @ ! is rejected outright
    private void RejectExtendedGlob(char current)
    {
        if (current == '(' && Lookahead is ('?' or '*' or '+' or '@' or '!'))
            throw new GlobPatternException("Extended glob patterns are not currently supported");
    }

    // `{` starts a literal set; literal sets cannot be nested.
    private Token LexOpenBrace()
    {
        if (InLiteralSet)
            throw new GlobPatternException($"Invalid nested literal set at offset {_cursor}");

        _openBraceOffset = _cursor;
        return Emit(SyntaxKind.OpenBraceToken, 1);
    }

    // `}` ends the current literal set and is an error outside of one.
    private Token LexCloseBrace()
    {
        if (!InLiteralSet)
            throw new GlobPatternException($"Invalid literal set terminator at offset {_cursor}");

        _openBraceOffset = -1;
        return Emit(SyntaxKind.CloseBraceToken, 1);
    }

    // Lexes `[...]` / `[!...]`. The token's span covers only the characters between the
    // brackets (a leading `!` excluded) and its value is the boxed inversion flag.
    private Token LexCharacterSet()
    {
        _cursor++; // accept [

        var inverted = Current == '!';
        if (inverted)
            _cursor++; // dont count the `!` in the character set

        var contentStart = _cursor;

        // a `]` in first position is an ordinary member rather than the terminator
        if (Current == ']')
            _cursor++;

        while (Current != ']')
        {
            if (Current == null)
                throw new GlobPatternException($"Unterminated character set at offset {contentStart}");

            _cursor++;
        }

        var contents = TextSpan.FromBounds(contentStart, _cursor);
        _cursor++; // accept `]`

        return new Token(SyntaxKind.CharacterSet, contents, inverted);
    }

    // Decides whether a character belongs to a literal run.
    private bool IsIdentCharacter(char current, bool inCharacterSet) => current switch
    {
        // if we are in a literal set we parse commas as their own token, otherwise
        // they are considered an identifier character
        ',' => !InLiteralSet,

        // character wildcards are treated as ident characters in character sets
        '?' => inCharacterSet,

        '{' or '}' or '[' or ']' or '*' or '/' => false,
        _ => true,
    };

    // Collects consecutive ident characters and escape sequences into one literal token
    // whose value is the unescaped text.
    private Token LexLiteral(bool inCharacterSet)
    {
        var begin = _cursor;
        _literalText.Clear();

        while (Current is char next)
        {
            if (next == '\\')
            {
                _literalText.Append(LexEscapeSequence(inCharacterSet));
                continue;
            }

            if (!IsIdentCharacter(next, inCharacterSet))
                break;

            _literalText.Append(next);
            _cursor++;
        }

        return new Token(SyntaxKind.LiteralToken, TextSpan.FromBounds(begin, _cursor), _literalText.ToString());
    }

    // Consumes a backslash plus the escaped character and returns that character as a string.
    private string LexEscapeSequence(bool inCharacterSet)
    {
        _cursor++; // accept \

        if (Current is char escaped && IsEscapable(escaped, inCharacterSet))
        {
            _cursor++;
            return escaped.ToString();
        }

        throw new GlobPatternException(
            $"Expected escape sequence at index pattern `{_cursor}` but found `\\{Current}`");
    }

    // Characters that may follow a backslash; `,` qualifies only inside a character set.
    private static bool IsEscapable(char c, bool inCharacterSet) =>
        c is '*' or '?' or '{' or '}' or '[' or ']' or '(' or ')' or ' '
        || (c == ',' && inCharacterSet);
}
Loading

0 comments on commit cf5b6a2

Please sign in to comment.