// This file is part of Visual D
//
// Visual D integrates the D programming language into Visual Studio
// Copyright (c) 2010 by Rainer Schuetze, All Rights Reserved
//
// Distributed under the Boost Software License, Version 1.0.
// See accompanying file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt

module c2d.tokenizer;

import std.ascii;
import std.string;
import std.utf;

version = V2;
// version = Java;
// version = IDL;
version = dollar_in_ident;

// A single token produced by Tokenizer.next(): its type (one of the enum
// values below), the line it was found on, its text and the whitespace that
// preceded it.
class Token
{
    // very basic C++ tokenizer, interested only in:
    // (the "// NN" markers give the numeric value of the enumerator on that line)
    enum {
        Comment,
        Newline,
        Identifier,
        Number,
        String,

        Namespace,
        Struct,
        Class,
        Union,
        Enum,

        Typedef, // 10
        Extern,
        Static,
        Const,
        __In,

        __Out,
        __Body,
        __Asm,
        __Declspec,
        If,

        Else, // 20
        Do,
        While,
        For,
        Return,

        Break,
        Continue,
        Switch,
        Goto,
        Delete,

        BraceL, // 30
        BraceR,
        BracketL,
        BracketR,
        ParenL,
        ParenR,

        Equal,
        Unequal,
        LessThan,
        LessEq,

        GreaterThan, // 40
        GreaterEq,
        Unordered,
        LessGreater,
        LessEqGreater,

        UnordGreater,
        UnordGreaterEq,
        UnordLess,
        UnordLessEq,
        UnordEq,

        Shl, // 50
        Shr,
        Comma,
        Asterisk,
        Ampersand,

        Assign,
        Dot,
        Elipsis,
        Colon,
        DoubleColon,

        Semicolon, // 60
        Tilde,
        Question,
        Exclamation,
        Deref,

        Plus,
        PlusPlus,
        Minus,
        MinusMinus,
        Div,

        Mod, // 70
        Xor,
        Or,
        OrOr,
        AmpAmpersand,

        AddAsgn,
        SubAsgn,
        MulAsgn,
        DivAsgn,
        ModAsgn,

        AndAsgn, // 80
        XorAsgn,
        OrAsgn,
        ShlAsgn,
        ShrAsgn,

        PPinclude,
        PPdefine,
        PPundef,
        PPif,
        PPifdef,

        PPifndef, // 90
        PPelse,
        PPelif,
        PPendif,
        PPother,
        PPinsert, // helper for reparsing

        Fis,
        FisFis,
        Macro,
        Other,

        EOF, // 100
        V1Tokens
    }

    // additional token types, numbered from V1Tokens onwards
    version(V2)
    {
        enum {
            New = V1Tokens,
            Static_if,
            Mixin,
            Case,
            Default,
            Operator,
            Version,
            Sizeof,
            This,
            Static_cast,
            Dynamic_cast,
            Reinterpret_cast,
            Const_cast,
            Empty, // helper for unspecified identifier in declaration
            Interface,
            Template,
        }
    }
    version(Java)
    {
        enum {
            Instanceof = V1Tokens,
        }
    }

    // Returns true for the preprocessor directive token types
    // (PPinclude .. PPother); PPinsert is not included.
    static bool isPPToken(int type)
    {
        switch(type)
        {
        case PPinclude, PPdefine, PPundef, PPif, PPifdef, PPifndef,
             PPelse, PPelif, PPendif, PPother:
            return true;
        default:
            return false;
        }
    }

    // Returns true for Class, Struct, Union, Enum and Typedef.
    static bool needsTrailingSemicolon(int type)
    {
        switch(type)
        {
        case Class, Struct, Union, Enum, Typedef:
            return true;
        default:
            return false;
        }
    }

    // Returns the source spelling of a token type, or a placeholder such as
    // "<identifier>" for types without fixed text. Types that have no
    // representation trip the assert and yield "<unexpected>".
    static string toString(int type)
    {
        switch(type)
        {
        case Namespace:     return "namespace";
        case Struct:        return "struct";
        case Class:         return "class";
        case Union:         return "union";
        case Enum:          return "enum";
        case Typedef:       return "typedef";
        case Extern:        return "extern";
        case Static:        return "static";
        case Const:         return "const";
        case __In:          return "__in";
        case __Out:         return "__out";
        case __Body:        return "__body";

        case __Asm:         return "__asm";
        case __Declspec:    return "__declspec";
        case If:            return "if";
        case Else:          return "else";
        case Do:            return "do";
        case While:         return "while";
        case For:           return "for";
        case Return:        return "return";
        case Break:         return "break";
        case Continue:      return "continue";
        case Switch:        return "switch";
        case Goto:          return "goto";
        case Delete:        return "delete";

        case BraceL:        return "{";
        case BraceR:        return "}";
        case BracketL:      return "[";
        case BracketR:      return "]";
        case ParenL:        return "(";
        case ParenR:        return ")";

        case Equal:         return "==";
        case Unequal:       return "!=";
        case LessThan:      return "<";
        case LessEq:        return "<=";
        case GreaterThan:   return ">";
        case GreaterEq:     return ">=";

        case Unordered:     return "!<>=";
        case LessGreater:   return "<>";
        case LessEqGreater: return "<>=";
        case UnordGreater:  return "!<=";
        case UnordGreaterEq: return "!<";
        case UnordLess:     return "!>=";
        case UnordLessEq:   return "!>";
        case UnordEq:       return "!<>";

        case Shl:           return "<<";
        case Shr:           return ">>";
        case Comma:         return ",";
        case Asterisk:      return "*";
        case Ampersand:     return "&";
        case Assign:        return "=";
        case Dot:           return ".";
        case Elipsis:       return "...";
        case Colon:         return ":";
        case DoubleColon:   return "::";
        case Semicolon:     return ";";
        case Tilde:         return "~";
        case Question:      return "?";
        case Exclamation:   return "!";
        case Deref:         return "->";
        case Plus:          return "+";
        case PlusPlus:      return "++";
        case Minus:         return "-";
        case MinusMinus:    return "--";
        case Div:           return "/";
        case Mod:           return "%";
        case Xor:           return "^";
        case Or:            return "|";
        case OrOr:          return "||";
        case AmpAmpersand:  return "&&";
        case AddAsgn:       return "+=";
        case SubAsgn:       return "-=";
        case MulAsgn:       return "*=";
        case DivAsgn:       return "/=";
        case ModAsgn:       return "%=";
        case AndAsgn:       return "&=";
        case XorAsgn:       return "^=";
        case OrAsgn:        return "|=";
        case ShlAsgn:       return "<<=";
        case ShrAsgn:       return ">>=";

        case PPinclude:     return "#include";
        case PPdefine:      return "#define";
        case PPundef:       return "#undef";
        case PPif:          return "#if";
        case PPifdef:       return "#ifdef";
        case PPifndef:      return "#ifndef";
        case PPelse:        return "#else";
        case PPelif:        return "#elif";
        case PPendif:       return "#endif";

        case Fis:           return "#";
        case FisFis:        return "##";

        version(V2)
        {
        case New:           return "new";
        case Static_if:     return "__static_if";
        case Mixin:         return "__mixin";
        case Case:          return "case";
        case Default:       return "default";
        case Operator:      return "operator";
        case Version:       return "version";
        case Sizeof:        return "sizeof";
        case This:          return "this";
        case Static_cast:   return "static_cast";
        case Dynamic_cast:  return "dynamic_cast";
        case Reinterpret_cast: return "reinterpret_cast";
        case Const_cast:    return "const_cast";
        case Empty:         return "";
        case Newline:       return "\n";
        case Interface:     return "interface";
        case Template:      return "template";

        }
        version(Java)
        {
        case Instanceof:    return "instanceof";
        }
        case Identifier:    return "<identifier>";
        case Number:        return "<number>";
        case String:        return "<string>";
        case EOF:           return "EOF";

        // other types supposed to fail because no representation available
        case Macro:
        case PPinsert:
        case Comment:
        case PPother:
        case Other:
        default:
            assert(type == EOF); // always fails
            return "<unexpected>";
        }
    }

    int type;       // one of the enum values above
    int lineno;     // line number recorded by Tokenizer.next()
    string text;    // characters consumed for this token
    string pretext; // characters skipped by skipSpace() in front of the token
}

///////////////////////////////////////////////////////////////////////

// Linear search: true if any element of arr compares equal to val.
bool contains(T)(ref T[] arr, T val)
{
    foreach(T t; arr)
        if (t == val)
            return true;
    return false;
}

// Appends val to arr unless an equal element is already present.
void addunique(T)(ref T[] arr, T val)
{
    if (!contains(arr, val))
        arr ~= val;
}

///////////////////////////////////////////////////////////////////////

// Splits a text into Tokens. Additional texts can be pushed on a stack with
// pushText(); scanning continues in the previous text once the pushed one is
// exhausted.
class Tokenizer
{
    this(string txt)
    {
        text = txt;
        reinit();
    }

    // Resets position, line counter and option flags to their defaults.
    // The text stack is left untouched.
    void reinit()
    {
        lastIndent = "";
        countTokens = 0;
        pos = 0;
        if(text.length >= 3 && text[0] == 0xef && text[1] == 0xbb && text[2] == 0xbf)
            pos += 3; // skip utf8 header
        lineno = 1;
        lastCharWasNewline = true;
        skipNewline = true;
        keepBackSlashAtEOL = false;
        enableASMComment = false;
    }

    // Makes txt the text being scanned. The current text and position are
    // saved for popText() only if the current text is not yet exhausted.
    // An empty txt is ignored.
    void pushText(string txt)
    {
        if(txt.length > 0)
        {
            if (pos < text.length)
            {
                txtstack ~= text;
                posstack ~= pos;
            }
            text = txt;
            pos = 0;
        }
    }

    // Restores the most recently saved text and position.
    // Returns false if the stack is empty.
    bool popText()
    {
        if(txtstack.length <= 0)
            return false;
        text = txtstack[$-1];
        pos = posstack[$-1];

        txtstack.length = txtstack.length - 1;
        posstack.length = posstack.length - 1;
        return true;
    }

    // True if the current text is exhausted and no saved text is left.
    bool eof()
    {
        return pos >= text.length && txtstack.length <= 0;
    }

    // True if fewer than n+1 characters remain in the current text.
    bool eof(int n)
    {
        // this call is used to check for a close newline, so it does not need to check the text stack
        return pos + n >= text.length;
    }

    // True if the current character is '\n' or '\r'.
    // The caller must ensure that pos is valid.
    bool isNewline()
    {
        if (text[pos] == '\n' || text[pos] == '\r')
            return true;
        return false;
    }

    // Advances by one character, switching back to the saved text when the
    // current one is exhausted.
    void incPos()
    {
        pos++;
        if (pos >= text.length)
            popText();
    }

    // Consumes consecutive line continuations (backslash directly followed by
    // "\n" or "\r\n"), counting lines. With keepBackSlashAtEOL set, "\\\n" is
    // appended to curText for each one.
    // Returns true if at least one continuation was consumed and the scan did
    // not stop on a backslash that is not followed by a newline.
    bool handleBackSlash()
    {
        if (eof(1) || text[pos] != '\\')
            return false;

        while (!eof(1) && text[pos] == '\\')
        {
            if (text[pos+1] == '\r' && !eof(2) && text[pos+2] == '\n')
            {
                lineno++;
                incPos();
                incPos();
                incPos();
            }
            else if (text[pos+1] == '\n')
            {
                lineno++;
                incPos();
                incPos();
            }
            else
                return false;
            if(keepBackSlashAtEOL)
                curText ~= "\\\n";
        }
        return true;
    }

    // Consumes one character into curText after handling line continuations.
    // "\r\n" is consumed as a pair, but only the '\n' is appended.
    // Returns false if nothing could be consumed or the end of the input was
    // reached by consuming.
    bool nextChar()
    {
        if (eof())
            return false;

        handleBackSlash();
        // a trailing line continuation may have consumed the rest of the input
        if (eof())
            return false;

        if (text[pos] == '\r' && !eof(1) && text[pos+1] == '\n')
        {
            lineno++;
            incPos();
            lastCharWasNewline = true;
        }
        else if (text[pos] == '\n')
        {
            lineno++;
            lastCharWasNewline = true;
        }
        else
            lastCharWasNewline = false;
        curText ~= text[pos];
        incPos();
        if (eof())
            return false;

        return true;
    }

    // Skips white space (and newlines, if skipNewline is set) including line
    // continuations; the white space following the last newline is collected
    // in lastIndent. Returns the number of lines advanced.
    int skipSpace()
    {
        bool collectIndent = lastCharWasNewline;
        if(collectIndent)
            lastIndent = "";

        int lines = lineno;
        handleBackSlash();
    cont_spaces:
        while(!eof() && isWhite(text[pos]))
        {
            if (isNewline())
            {
                if (!skipNewline)
                    break;
                else
                {
                    collectIndent = true;
                    lastIndent = "";
                }
            }
            else if(collectIndent)
                lastIndent ~= text[pos];

            nextChar();
        }
        if (!keepBackSlashAtEOL)
        {
            if(!eof(2) && text[pos] == '\\' && (text[pos+1] == '\n' || text[pos+1] == '\r'))
            {
                nextChar();
                nextChar();
                goto cont_spaces;
            }
        }
        else if (handleBackSlash())
            goto cont_spaces;

        return lineno - lines;
    }

    // Consumes up to the next newline; the newline itself is consumed as well
    // if skipNewline is set.
    void skipLine()
    {
        while(!eof() && !isNewline())
            nextChar();
        if(!eof() && skipNewline)
            nextChar();
    }

    // Consumes a string or character literal; the current character is taken
    // as the delimiter. Returns false if the input ends before the closing
    // delimiter.
    // Throws: Exception on a newline inside the literal (unless version IDL).
    bool skipString()
    {
        int sep = text[pos];
        nextChar();
        while(!eof() && text[pos] != sep)
        {
            version(IDL) {} else {
                if(isNewline())
                    throw new Exception("newline in string constant");
            }
            if(!handleBackSlash())
            {
                // an escape sequence: consume the backslash with the next character
                if(text[pos] == '\\')
                    nextChar();
                nextChar();
            }
        }
        if (eof())
            return false;
        nextChar();
        return true;
    }

    // Consumes an identifier; returns false if the current character cannot
    // start one.
    bool skipIdent()
    {
        if (eof())
            return false;
        if(!isAlpha(text[pos]) && text[pos] != '_')
            return false;
        nextChar();
        return skipAlnum();
    }

    // Consumes letters, digits and '_' (and '$' with version dollar_in_ident).
    // Always returns true.
    bool skipAlnum()
    {
        version(dollar_in_ident)
            while(!eof() && (isAlphaNum(text[pos]) || text[pos] == '_' || text[pos] == '$'))
                nextChar();
        else
            while(!eof() && (isAlphaNum(text[pos]) || text[pos] == '_'))
                nextChar();
        return true;
    }

    // Consumes a number: alphanumerics, optionally followed by '.', more
    // alphanumerics and a signed exponent. An exponent sign is only looked
    // for after a fractional part. Always returns true.
    bool skipNumber()
    {
        nextChar();
        skipAlnum();
        if(eof() || text[pos] != '.')
            return true;
        // float
        nextChar();
        skipAlnum();
        // the number may end the input right after the exponent letter (e.g. "1.5e"),
        //  and pos can be 0 after switching back to a saved text
        if(eof() || pos == 0)
            return true;
        if(text[pos-1] == 'E' || text[pos-1] == 'e' || text[pos-1] == 'P' || text[pos-1] == 'p')
            if(text[pos] == '+' || text[pos] == '-')
            {
                nextChar();
                skipAlnum();
            }
        return true;
    }

    // Consumes a block comment up to and including the closing "*/" or to the
    // end of the input; expects to be positioned on the '*' of the opening "/*".
    void skipComment()
    {
        while(nextChar())
        {
            if (text[pos] == '*' && pos + 1 < text.length && text[pos+1] == '/')
            {
                nextChar();
                nextChar();
                break;
            }
        }
    }

    // charTypes is a list of (character, token type) pairs. If the current
    // character matches one of the characters, it is consumed and the
    // associated token type is returned, otherwise def is returned.
    int checkChar(int def, charTypes...)()
    {
        int ch = text[pos];
        bool isChar = true; // toggles between the character and the type of a pair
        bool found = false;
        foreach(int ct; charTypes)
        {
            if(isChar)
                found = (ct == ch);
            else if(found)
            {
                nextChar();
                return ct;
            }
            isChar = !isChar;
        }
        return def;
    }

    // Consumes the current character and then applies checkChar to the
    // following one.
    int checkNextChar(int def, charTypes...)()
    {
        // we were always sitting on a valid character, and we don't want appending "\\\n",
        //  so we do the relevant parts of nextChar() here
        lastCharWasNewline = false;
        curText ~= text[pos];
        incPos();
        if(!eof())
        {
            return checkChar!(def, charTypes);
        }
        return def;
    }

    // If tok has type iftype, tries to extend it by one more character
    // according to charTypes. Returns the resulting type of tok.
    int contNextChar(int iftype, charTypes...)(Token tok)
    {
        if(tok.type == iftype && !eof())
        {
            tok.type = checkChar!(iftype, charTypes);
        }
        return tok.type;
    }

    // Maps a keyword to its token type; anything else is Token.Identifier.
    static int identifierToKeyword(string ident)
    {
        switch(ident)
        {
        case "namespace":   return Token.Namespace;
        case "struct":      return Token.Struct;
        case "class":       return Token.Class;
        case "union":       return Token.Union;
        case "enum":        return Token.Enum;
        case "typedef":     return Token.Typedef;
        case "extern":      return Token.Extern;
        case "static":      return Token.Static;
        case "const":       return Token.Const;
        case "__in":        return Token.__In;
        case "__out":       return Token.__Out;
        case "__body":      return Token.__Body;
        case "_asm":        return Token.__Asm;
        case "__asm":       return Token.__Asm;
        case "__declspec":  return Token.__Declspec;
        case "if":          return Token.If;
        case "else":        return Token.Else;
        case "while":       return Token.While;
        case "do":          return Token.Do;
        case "for":         return Token.For;
        case "switch":      return Token.Switch;
        case "goto":        return Token.Goto;
        case "return":      return Token.Return;
        case "continue":    return Token.Continue;
        case "break":       return Token.Break;
        case "delete":      return Token.Delete;
        version(V2)
        {
        case "case":        return Token.Case;
        case "default":     return Token.Default;
        case "__static_if": return Token.Static_if;
        case "__mixin":     return Token.Mixin;
        case "__version":   return Token.Version;
        case "sizeof":      return Token.Sizeof;
        case "operator":    return Token.Operator;
        case "new":         return Token.New;
        case "this":        return Token.This;
        case "static_cast": return Token.Static_cast;
        case "dynamic_cast": return Token.Dynamic_cast;
        case "reinterpret_cast": return Token.Reinterpret_cast;
        case "const_cast":  return Token.Const_cast;
        case "interface":   return Token.Interface;
        case "template":    return Token.Template;
        }
        version(Java)
        {
        case "instanceof":  return Token.Instanceof;
        }
        default:            return Token.Identifier;
        }
    }

    // Reads the next token into tok (type, lineno, text and pretext).
    // Returns false at the end of the input, with tok.type set to Token.EOF.
    // Throws: Exception for ".." not followed by a third '.', and whatever
    //         skipString() throws.
    bool next(Token tok)
    {
        curText = "";
        bool startOfLine = pos <= 0 || text[pos-1] == '\n' || text[pos-1] == '\r';
        if(skipSpace() > 0)
            startOfLine = true;

        tok.pretext = curText;
        tok.lineno = lineno;

        if(eof())
        {
            tok.text = "";
            tok.type = Token.EOF;
            return false;
        }

        curText = "";
        tok.type = Token.Other;

        switch(text[pos])
        {
        case '{':  tok.type = Token.BraceL;    nextChar(); break;
        case '}':  tok.type = Token.BraceR;    nextChar(); break;
        case '[':  tok.type = Token.BracketL;  nextChar(); break;
        case ']':  tok.type = Token.BracketR;  nextChar(); break;
        case '(':  tok.type = Token.ParenL;    nextChar(); break;
        case ')':  tok.type = Token.ParenR;    nextChar(); break;
        case ',':  tok.type = Token.Comma;     nextChar(); break;
        case '~':  tok.type = Token.Tilde;     nextChar(); break;
        case '?':  tok.type = Token.Question;  nextChar(); break;
        case '\r':
        case '\n': tok.type = Token.Newline;   nextChar(); break;

        case '=':  tok.type = checkNextChar!(Token.Assign, '=', Token.Equal); break;
        case '*':  tok.type = checkNextChar!(Token.Asterisk, '=', Token.MulAsgn); break;
        case '%':  tok.type = checkNextChar!(Token.Mod, '=', Token.ModAsgn); break;
        case '^':  tok.type = checkNextChar!(Token.Xor, '=', Token.XorAsgn); break;
        case '&':  tok.type = checkNextChar!(Token.Ampersand, '=', Token.AndAsgn, '&', Token.AmpAmpersand); break;
        case '|':  tok.type = checkNextChar!(Token.Or, '=', Token.OrAsgn, '|', Token.OrOr); break;
        case ':':  tok.type = checkNextChar!(Token.Colon, ':', Token.DoubleColon); break;
        case '-':  tok.type = checkNextChar!(Token.Minus, '=', Token.SubAsgn, '>', Token.Deref, '-', Token.MinusMinus); break;
        case '+':  tok.type = checkNextChar!(Token.Plus, '=', Token.AddAsgn, '+', Token.PlusPlus); break;

        case '<':
            tok.type = checkNextChar!(Token.LessThan, '=', Token.LessEq, '<', Token.Shl, '>', Token.LessGreater);
            contNextChar!(Token.Shl, '=', Token.ShlAsgn)(tok);
            contNextChar!(Token.LessGreater, '=', Token.LessEqGreater)(tok);
            break;
        case '>':
            tok.type = checkNextChar!(Token.GreaterThan, '=', Token.GreaterEq, '>', Token.Shr);
            contNextChar!(Token.Shr, '=', Token.ShrAsgn)(tok);
            break;

        case '!':
            // !   -> != !< !>
            tok.type = checkNextChar!(Token.Exclamation, '=', Token.Unequal, '<', Token.UnordGreaterEq, '>', Token.UnordLessEq);
            // !<  -> !<= !<>
            contNextChar!(Token.UnordGreaterEq, '=', Token.UnordGreater, '>', Token.UnordEq)(tok);
            // !<> -> !<>=
            contNextChar!(Token.UnordEq, '=', Token.Unordered)(tok);
            // !>  -> !>=
            contNextChar!(Token.UnordLessEq, '=', Token.UnordLess)(tok);
            break;

        case '.':
            tok.type = checkNextChar!(Token.Dot, '.', Token.Elipsis);
            if(tok.type == Token.Elipsis)
            {
                // ".." may be the last two characters of the input
                if(eof() || text[pos] != '.')
                    throw new Exception("missing third '.' for '...'");
                nextChar();
            }
            break;

        case '#':
            nextChar();
            if(!startOfLine)
            {
                // '#' may be the last character of the input
                if(!eof() && text[pos] == '#')
                {
                    tok.type = Token.FisFis;
                    nextChar();
                }
                else
                    tok.type = Token.Fis;
            }
            else if(skipSpace() == 0)
            {
                // preprocessor directive; the type stays Token.Other if no
                //  identifier follows on the same line
                int identpos = pos;
                if (skipIdent())
                {
                    string ident = text[identpos..pos];
                    switch(ident)
                    {
                    case "include": tok.type = Token.PPinclude; break;
                    case "define":  tok.type = Token.PPdefine;  break;
                    case "undef":   tok.type = Token.PPundef;   break;
                    case "ifdef":   tok.type = Token.PPifdef;   break;
                    case "ifndef":  tok.type = Token.PPifndef;  break;
                    case "if":      tok.type = Token.PPif;      break;
                    case "elif":    tok.type = Token.PPelif;    break;
                    case "else":    tok.type = Token.PPelse;    break;
                    case "endif":   tok.type = Token.PPendif;   break;
                    default:        tok.type = Token.PPother;   break;
                    }
                }
            }
            break;

        case '0','1','2','3','4','5','6','7','8','9':
            skipNumber();
            tok.type = Token.Number;
            break;

        case 'L':
            // L"..." and L'...' are string tokens, anything else an identifier
            if(nextChar() && (text[pos] == '\"' || text[pos] == '\''))
                goto case '\"';
            skipAlnum();
            tok.type = Token.Identifier;
            break;

        case 'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z':
            goto case;
        case 'A','B','C','D','E','F','G','H','I','J','K',    'M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z':
            goto case;
        case '_':
            skipIdent();
            if(ppOnly)
                tok.type = Token.Identifier;
            else
                tok.type = identifierToKeyword(curText);
            break;

        case '$':
            nextChar();
            skipAlnum();
            tok.type = Token.Macro;
            break;
        case ';':
            if (enableASMComment)
            {
                skipLine();
                tok.type = Token.Comment;
            }
            else
            {
                tok.type = Token.Semicolon;
                nextChar();
            }
            break;
        case '/':
            nextChar();
            tok.type = Token.Div;
            if(!eof())
            {
                if(text[pos] == '/')
                {
                    skipLine();
                    tok.type = Token.Comment;
                }
                else if(text[pos] == '*')
                {
                    skipComment();
                    tok.type = Token.Comment;
                }
                else if(text[pos] == '=')
                {
                    nextChar();
                    tok.type = Token.DivAsgn;
                }
            }
            break;

        case '\'':
        case '\"':
            skipString();
            tok.type = Token.String;
            break;

        default:
            tok.type = Token.Other;
            nextChar();
            break;
        }

        countTokens++;
        tok.text = curText;
        return true;
    }

    string lastIndent;  // white space collected by skipSpace() after the last newline
    string text;        // text currently being scanned
    string curText;     // characters consumed since it was last cleared

    int[] posstack;     // positions saved by pushText()
    string[] txtstack;  // texts saved by pushText()

    uint pos;           // index of the current character in text
    int lineno;         // current line number, starting at 1
    int countTokens;    // number of tokens returned by next()
    bool lastCharWasNewline;
    bool skipNewline;        // if set, skipSpace() and skipLine() consume newlines
    bool keepBackSlashAtEOL; // if set, line continuations are copied to curText
    bool enableASMComment;   // if set, ';' starts a comment that extends to the end of the line

    static bool ppOnly;      // if set, next() does not translate identifiers to keywords
}