#region Copyright and License
//
// Fizzler - CSS Selector Engine for Microsoft .NET Framework
// Copyright (c) 2009 Atif Aziz, Colin Ramsay. All rights reserved.
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3 of the License, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
#endregion

namespace Fizzler
{
    #region Imports

    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Text;

    #endregion

    /// <summary>
    /// Lexer for tokens in CSS selector grammar.
    /// </summary>
    public static class Tokener
    {
        /// <summary>
        /// Parses tokens from a given text source.
        /// </summary>
        /// <param name="reader">Source of the selector text; read to its end.</param>
        /// <returns>
        /// A lazily evaluated sequence of tokens that always ends with the
        /// end-of-input token.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="reader"/> is <c>null</c>. Raised immediately, not
        /// deferred to enumeration.
        /// </exception>
        /// <exception cref="FormatException">
        /// Raised while enumerating when the text contains an invalid
        /// character, identifier, hash, escape sequence or unterminated string.
        /// </exception>
        public static IEnumerable<Token> Tokenize(TextReader reader)
        {
            // This overload is deliberately not an iterator so that the
            // argument check and the read happen at call time.
            if (reader == null) throw new ArgumentNullException("reader");
            return Tokenize(reader.ReadToEnd());
        }

        /// <summary>
        /// Parses tokens from a given string.
        /// </summary>
        /// <param name="input">
        /// Selector text; <c>null</c> is treated as the empty string.
        /// </param>
        /// <returns>
        /// A lazily evaluated sequence of tokens that always ends with the
        /// end-of-input token.
        /// </returns>
        /// <exception cref="FormatException">
        /// Raised while enumerating when the text contains an invalid
        /// character, identifier, hash, escape sequence or unterminated string.
        /// </exception>
        public static IEnumerable<Token> Tokenize(string input)
        {
            var reader = new Reader(input ?? string.Empty);

            while (reader.Read() != null)
            {
                var ch = reader.Value;

                //
                // Identifier or function
                //
                if (ch == '-' || IsNmStart(ch))
                {
                    reader.Mark();
                    if (reader.Value == '-')
                    {
                        // A leading dash must be followed by a name-start character.
                        if (!IsNmStart(reader.Read()))
                            throw new FormatException(string.Format("Invalid identifier at position {0}.", reader.Position));
                    }
                    while (IsNmChar(reader.Read())) { /* NOP */ }
                    if (reader.Value == '(')
                        yield return Token.Function(reader.Marked()); // '(' is consumed but not part of the name
                    else
                        yield return Token.Ident(reader.MarkedWithUnread());
                }
                //
                // Integer
                //
                else if (IsDigit(ch))
                {
                    reader.Mark();
                    do { /* NOP */ } while (IsDigit(reader.Read()));
                    yield return Token.Integer(reader.MarkedWithUnread());
                }
                //
                // Whitespace, including that which is coupled with some punctuation
                //
                else if (IsS(ch))
                {
                    var space = ParseWhiteSpace(reader);
                    ch = reader.Read();
                    switch (ch)
                    {
                        case ',': yield return Token.Comma(); break;
                        case '+': yield return Token.Plus(); break;
                        case '>': yield return Token.Greater(); break;
                        case '~': yield return Token.Tilde(); break;

                        default:
                            // Not a combinator: give the character back and
                            // report the whitespace on its own.
                            reader.Unread();
                            yield return Token.WhiteSpace(space);
                            break;
                    }
                }
                else switch (ch)
                {
                    case '*': // * or *=
                    case '~': // ~ or ~=
                    case '|': // | or |=
                    {
                        if (reader.Read() == '=')
                        {
                            yield return ch == '*'
                                       ? Token.SubstringMatch()
                                       : ch == '|' ? Token.DashMatch()
                                       : Token.Includes();
                        }
                        else
                        {
                            reader.Unread();
                            yield return ch == '*' || ch == '|'
                                ? Token.Char(ch.Value)
                                : Token.Tilde();
                        }
                        break;
                    }
                    case '^': // ^=
                    case '$': // $=
                    {
                        // These characters are only valid as part of a
                        // two-character match operator.
                        if (reader.Read() != '=')
                            throw new FormatException(string.Format("Invalid character at position {0}.", reader.Position));

                        switch (ch)
                        {
                            case '^': yield return Token.PrefixMatch(); break;
                            case '$': yield return Token.SuffixMatch(); break;
                        }
                        break;
                    }
                    //
                    // Single-character punctuation
                    //
                    case '.': yield return Token.Dot(); break;
                    case ':': yield return Token.Colon(); break;
                    case ',': yield return Token.Comma(); break;
                    case '=': yield return Token.Equals(); break;
                    case '[': yield return Token.LeftBracket(); break;
                    case ']': yield return Token.RightBracket(); break;
                    case ')': yield return Token.RightParenthesis(); break;
                    case '+': yield return Token.Plus(); break;
                    case '>': yield return Token.Greater(); break;
                    case '#': yield return Token.Hash(ParseHash(reader)); break;
                    //
                    // Single- or double-quoted strings
                    //
                    case '\"':
                    case '\'': yield return ParseString(reader, /* quote */ ch.Value); break;

                    default:
                        throw new FormatException(string.Format("Invalid character at position {0}.", reader.Position));
                }
            }
            yield return Token.Eoi();
        }

        /// <summary>
        /// Consumes a run of whitespace starting at the reader's current
        /// character and returns it. On return the reader is positioned on
        /// the last whitespace character of the run.
        /// </summary>
        private static string ParseWhiteSpace(Reader reader)
        {
            Debug.Assert(reader != null);

            reader.Mark();
            while (IsS(reader.Read())) { /* NOP */ }
            return reader.MarkedWithUnread();
        }

        /// <summary>
        /// Reads the name that follows a <c>#</c> (the reader's current
        /// character) and returns it without the <c>#</c>.
        /// </summary>
        /// <exception cref="FormatException">
        /// No name characters follow the <c>#</c>.
        /// </exception>
        private static string ParseHash(Reader reader)
        {
            Debug.Assert(reader != null);

            reader.MarkFromNext(); // skipping #
            while (IsNmChar(reader.Read())) { /* NOP */ }
            var text = reader.MarkedWithUnread();
            if (text.Length == 0)
                throw new FormatException(string.Format("Invalid hash at position {0}.", reader.Position));
            return text;
        }

        /// <summary>
        /// Reads a quoted string whose opening quote is the reader's current
        /// character and returns a string token holding the unescaped content.
        /// The closing quote is consumed.
        /// </summary>
        /// <param name="reader">Reader positioned on the opening quote.</param>
        /// <param name="quote">The quote character that delimits the string.</param>
        /// <exception cref="FormatException">
        /// The input ends before the closing quote, or a backslash is followed
        /// by anything other than the quote character or another backslash.
        /// </exception>
        private static Token ParseString(Reader reader, char quote)
        {
            Debug.Assert(reader != null);

            //
            // TODO Support full string syntax!
            //
            // string    {string1}|{string2}
            // string1   \"([^\n\r\f\\"]|\\{nl}|{nonascii}|{escape})*\"
            // string2   \'([^\n\r\f\\']|\\{nl}|{nonascii}|{escape})*\'
            // nonascii  [^\0-\177]
            // escape    {unicode}|\\[^\n\r\f0-9a-f]
            // unicode   \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
            //

            var strpos = reader.Position;
            reader.MarkFromNext(); // skipping quote

            char? ch;
            StringBuilder sb = null; // allocated only if an escape is encountered

            while ((ch = reader.Read()) != quote)
            {
                if (ch == null)
                    throw new FormatException(string.Format("Unterminated string at position {0}.", strpos));

                if (ch == '\\')
                {
                    ch = reader.Read();

                    //
                    // NOTE: Only escaping of quote and backslash supported!
                    //

                    if (ch != quote && ch != '\\')
                        throw new FormatException(string.Format("Invalid escape sequence at position {0} in a string at position {1}.", reader.Position, strpos));

                    if (sb == null)
                        sb = new StringBuilder();

                    // Flush the text preceding the backslash, then restart the
                    // mark on the escaped character so that it becomes the
                    // first character of the next segment.
                    sb.Append(reader.MarkedExceptLast());
                    reader.Mark();
                }
            }

            var text = reader.Marked();

            if (sb != null)
                text = sb.Append(text).ToString();

            return Token.String(text);
        }

        /// <summary>Tests for an ASCII decimal digit; <c>null</c> yields <c>false</c>.</summary>
        private static bool IsDigit(char? ch) // [0-9]
        {
            return ch >= '0' && ch <= '9';
        }

        /// <summary>Tests for a whitespace character; <c>null</c> yields <c>false</c>.</summary>
        private static bool IsS(char? ch) // [ \t\r\n\f]
        {
            return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f';
        }

        /// <summary>
        /// Tests for a character that may start a name. Only the underscore and
        /// ASCII letters are accepted; the {nonascii} and {escape} parts of the
        /// grammar rule are not implemented.
        /// </summary>
        private static bool IsNmStart(char? ch) // [_a-z]|{nonascii}|{escape}
        {
            return ch == '_'
                || (ch >= 'a' && ch <= 'z')
                || (ch >= 'A' && ch <= 'Z');
        }

        /// <summary>
        /// Tests for a character that may appear inside a name: anything
        /// accepted by <see cref="IsNmStart"/>, a dash or an ASCII digit.
        /// </summary>
        private static bool IsNmChar(char? ch) // [_a-z0-9-]|{nonascii}|{escape}
        {
            return IsNmStart(ch) || ch == '-' || (ch >= '0' && ch <= '9');
        }

        /// <summary>
        /// Forward character cursor over a string with one-step push-back and
        /// a single mark used to extract the text of a token.
        /// </summary>
        private sealed class Reader
        {
            private readonly string _input;
            private int _index = -1; // -1 before the first read; _input.Length once exhausted
            private int _start = -1; // index recorded by the last mark

            public Reader(string input)
            {
                _input = input;
            }

            private bool Ready
            {
                get { return _index >= 0 && _index < _input.Length; }
            }

            /// <summary>
            /// The current character, or <c>null</c> before the first read and
            /// after the end of input.
            /// </summary>
            public char? Value
            {
                get { return Ready ? _input[_index] : (char?)null; }
            }

            /// <summary>
            /// One-based position of the current character (zero before the
            /// first read).
            /// </summary>
            public int Position
            {
                get { return _index + 1; }
            }

            /// <summary>Marks the current character as the start of the marked text.</summary>
            public void Mark()
            {
                _start = _index;
            }

            /// <summary>Marks the character after the current one as the start of the marked text.</summary>
            public void MarkFromNext()
            {
                _start = _index + 1;
            }

            /// <summary>
            /// Returns the text from the mark up to, but excluding, the
            /// current character.
            /// </summary>
            public string Marked()
            {
                return Marked(0);
            }

            /// <summary>
            /// Same as <see cref="Marked()"/> but also drops the character
            /// just before the current one.
            /// </summary>
            public string MarkedExceptLast()
            {
                return Marked(-1);
            }

            private string Marked(int trim)
            {
                var start = _start;
                var count = Math.Min(_input.Length, _index + trim) - start;
                return count > 0
                     ? _input.Substring(start, count)
                     : string.Empty;
            }

            /// <summary>
            /// Advances to the next character and returns it, or <c>null</c>
            /// at the end of input. The index never moves past the input length.
            /// </summary>
            public char? Read()
            {
                _index = Position >= _input.Length
                       ? _input.Length
                       : _index + 1;
                return Value;
            }

            /// <summary>
            /// Steps back one character so that the next <see cref="Read"/>
            /// returns the current character again.
            /// </summary>
            public void Unread()
            {
                _index = Math.Max(-1, _index - 1);
            }

            /// <summary>
            /// Returns the marked text and then steps back one character, so
            /// that the character that ended the token is read again.
            /// </summary>
            public string MarkedWithUnread()
            {
                var text = Marked();
                Unread();
                return text;
            }
        }
    }
}