Lexer.hs 5.39 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
-- Copyright (c) 2013, Kenton Varda <temporal@gmail.com>
-- All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
--
-- 1. Redistributions of source code must retain the above copyright notice, this
--    list of conditions and the following disclaimer.
-- 2. Redistributions in binary form must reproduce the above copyright notice,
--    this list of conditions and the following disclaimer in the documentation
--    and/or other materials provided with the distribution.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

module Lexer (lexer) where

import Text.Parsec hiding (token, tokens)
import Text.Parsec.String
28
import Control.Monad (liftM, when)
29 30 31
import qualified Text.Parsec.Token as T
import Text.Parsec.Language (emptyDef)
import Token
32
import Data.Char (isUpper, isLower)
33 34

keywords =
35 36
    [ (VoidKeyword, "void")
    , (TrueKeyword, "true")
37 38
    , (FalseKeyword, "false")
    , (InKeyword, "in")
Kenton Varda's avatar
Kenton Varda committed
39
    , (OfKeyword, "of")
Kenton Varda's avatar
Kenton Varda committed
40
    , (OnKeyword, "on")
Kenton Varda's avatar
Kenton Varda committed
41 42 43 44
    , (AsKeyword, "as")
    , (WithKeyword, "with")
    , (FromKeyword, "from")
    , (ImportKeyword, "import")
45
    , (UsingKeyword, "using")
46 47
    , (ConstKeyword, "const")
    , (EnumKeyword, "enum")
Kenton Varda's avatar
Kenton Varda committed
48
    , (StructKeyword, "struct")
Kenton Varda's avatar
Kenton Varda committed
49
    , (UnionKeyword, "union")
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
    , (InterfaceKeyword, "interface")
    , (OptionKeyword, "option")
    ]

languageDef :: T.LanguageDef st
languageDef = emptyDef
    { T.commentLine = "#"
    , T.identStart = letter <|> char '_'
    , T.identLetter = alphaNum <|> char '_'
    , T.reservedNames = [name | (_, name) <- keywords]
    , T.opStart = T.opLetter languageDef
    , T.opLetter = fail "There are no operators."
    }

tokenParser = T.makeTokenParser languageDef

66
rawIdentifier  = T.identifier tokenParser
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
reserved       = T.reserved tokenParser
symbol         = T.symbol tokenParser
naturalOrFloat = T.naturalOrFloat tokenParser
braces         = T.braces tokenParser
parens         = T.parens tokenParser
brackets       = T.brackets tokenParser
whiteSpace     = T.whiteSpace tokenParser
stringLiteral  = T.stringLiteral tokenParser

keyword :: Parser Token
keyword = foldl1 (<|>) [reserved name >> return t | (t, name) <- keywords]

toLiteral :: Either Integer Double -> Token
toLiteral (Left i) = LiteralInt i
toLiteral (Right d) = LiteralFloat d

located :: Parser t -> Parser (Located t)
located p = do
    pos <- getPosition
    t <- p
    return (Located pos t)

89 90 91 92 93 94 95 96 97 98 99
isTypeName (c:_) = isUpper c
isTypeName _ = False

hasUppercaseAcronym (a:rest@(b:c:_)) =
    (isUpper a && isUpper b && not (isLower c)) || hasUppercaseAcronym rest
hasUppercaseAcronym (a:b:[]) = isUpper a && isUpper b
hasUppercaseAcronym _ = False

identifier :: Parser Token
identifier = do
    text <- rawIdentifier
100
    when (elem '_' text) $
101 102 103 104 105 106 107 108
        fail "Identifiers containing underscores are reserved for the implementation.  Use \
             \camelCase style for multi-word names."
    when (hasUppercaseAcronym text) $
        fail "Wrong style:  Only the first letter of an acronym should be capitalized.  \
             \Consistent style is necessary to allow code generators to sanely translate \
             \names into the target language's preferred style."
    return (if isTypeName text then TypeIdentifier text else Identifier text)

109 110 111 112 113
tokenSequence = do
    tokens <- many1 locatedToken
    endPos <- getPosition
    return (TokenSequence tokens endPos)

114 115
token :: Parser Token
token = keyword
116
    <|> identifier
117 118
    <|> liftM ParenthesizedList  (parens (sepBy tokenSequence (symbol ",")))
    <|> liftM BracketedList      (brackets (sepBy tokenSequence (symbol ",")))
119 120 121 122 123 124
    <|> liftM toLiteral          naturalOrFloat
    <|> liftM LiteralString      stringLiteral
    <|> liftM (const AtSign)     (symbol "@")
    <|> liftM (const Colon)      (symbol ":")
    <|> liftM (const Period)     (symbol ".")
    <|> liftM (const EqualsSign) (symbol "=")
Kenton Varda's avatar
Kenton Varda committed
125
    <|> liftM (const MinusSign)  (symbol "-")
Kenton Varda's avatar
Kenton Varda committed
126
    <|> liftM (const ExclamationPoint) (symbol "!")
127 128 129 130 131 132 133 134
    <?> "token"

locatedToken = located token

statementEnd :: Parser (Maybe [Located Statement])
statementEnd = (symbol ";" >>= \_ -> return Nothing)
           <|> (braces (many locatedStatement) >>= \statements -> return (Just statements))

135
compileStatement :: TokenSequence -> Maybe [Located Statement] -> Statement
136 137 138 139 140
compileStatement tokens Nothing = Line tokens
compileStatement tokens (Just statements) = Block tokens statements

statement :: Parser Statement
statement = do
141
    tokens <- tokenSequence
142 143 144 145 146 147 148 149 150 151 152
    end <- statementEnd
    return (compileStatement tokens end)

locatedStatement = located statement

lexer :: Parser [Located Statement]
lexer = do
    whiteSpace
    tokens <- many locatedStatement
    eof
    return tokens