Bug 316311 - [C++0x] Raw and unicode string literals

2025-06-08 18:26:01 +02:00 · 2010-08-30 15:55:14 +00:00 · 2010-08-30 15:55:14 +00:00 · 2475fe0010
commit 2475fe0010
parent 9505f9ea9e
3 changed files with 251 additions and 41 deletions
--- a/core/org.eclipse.cdt.core.tests/parser/org/eclipse/cdt/core/parser/tests/scanner/LexerTests.java
+++ b/core/org.eclipse.cdt.core.tests/parser/org/eclipse/cdt/core/parser/tests/scanner/LexerTests.java
@ -18,8 +18,8 @@ import org.eclipse.cdt.core.parser.IToken;
 import org.eclipse.cdt.core.parser.tests.ast2.TestLexerLog;
 import org.eclipse.cdt.core.testplugin.util.BaseTestCase;
 import org.eclipse.cdt.internal.core.parser.scanner.Lexer;
-import org.eclipse.cdt.internal.core.parser.scanner.Token;
 import org.eclipse.cdt.internal.core.parser.scanner.Lexer.LexerOptions;
+import org.eclipse.cdt.internal.core.parser.scanner.Token;


 public class LexerTests extends BaseTestCase {
@ -64,6 +64,12 @@ public class LexerTests extends BaseTestCase {
 		fLastEndOffset= 0;
 	}

+	private void nextDirective() throws Exception {
+		IToken t= fLexer.nextDirective();
+		assertNotNull(t);
+		fLastEndOffset= t.getOffset();
+	}
+	
 	private void token(int tokenType) throws Exception {
 		token(tokenType, null);
 	}
@ -103,10 +109,34 @@ public class LexerTests extends BaseTestCase {
 		token(IToken.tUTF16STRING, "u\"" + expectedImage + "\"");
 	}

+	private void utf8str(String expectedImage) throws Exception {
+		token(IToken.tSTRING, "u8\"" + expectedImage + "\"");
+	}
+
 	private void utf32str(String expectedImage) throws Exception {
 		token(IToken.tUTF32STRING, "U\"" + expectedImage + "\"");
 	}

+	private void rstr(String marker, String expectedImage) throws Exception {
+		token(IToken.tSTRING, "R\"" + marker + '(' + expectedImage + ')' + marker + "\"");
+	}
+
+	private void wrstr(String marker, String expectedImage) throws Exception {
+		token(IToken.tLSTRING, "LR\"" + marker + '(' + expectedImage + ')' + marker + "\"");
+	}
+
+	private void utf16rstr(String marker, String expectedImage) throws Exception {
+		token(IToken.tUTF16STRING, "uR\"" + marker + '(' + expectedImage + ')' + marker + "\"");
+	}
+
+	private void utf8rstr(String marker, String expectedImage) throws Exception {
+		token(IToken.tSTRING, "u8R\"" + marker + '(' + expectedImage + ')' + marker + "\"");
+	}
+
+	private void utf32rstr(String marker, String expectedImage) throws Exception {
+		token(IToken.tUTF32STRING, "UR\"" + marker + '(' + expectedImage + ')' + marker + "\"");
+	}
+
 	private void ch(String expectedImage) throws Exception {
 		token(IToken.tCHAR, expectedImage);
 	}
@ -482,6 +512,10 @@ public class LexerTests extends BaseTestCase {
 		wstr(lit);
 		eof();

+		init("u8\"" + lit + '"');
+		utf8str(lit);
+		eof();
+
 		init("u\"" + lit + '"');
 		utf16str(lit);
 		eof();
@ -540,6 +574,78 @@ public class LexerTests extends BaseTestCase {
 		eof();
 	}

+	public void testRawStringLiteral() throws Exception {
+		String lit= "abc0123\\\"'.:; \\\\ \n\"(";
+		init("R\"(" + lit + ")\"");
+		rstr("", lit);
+		eof();
+
+		init("LR\"(" + lit + ")\"");
+		wrstr("", lit);
+		eof();
+
+		init("u8R\"(" + lit + ")\"");
+		utf8rstr("", lit);
+		eof();
+
+		init("uR\"(" + lit + ")\"");
+		utf16rstr("", lit);
+		eof();
+		
+		init("UR\"(" + lit + ")\"");
+		utf32rstr("", lit);
+		eof();
+
+		init("R\"ut");
+		problem(IProblem.SCANNER_UNBOUNDED_STRING, "R\"ut");
+		token(IToken.tSTRING, "R\"ut");
+		eof();
+
+		init("LR\"(ut");
+		problem(IProblem.SCANNER_UNBOUNDED_STRING, "LR\"(ut");
+		token(IToken.tLSTRING, "LR\"(ut");
+		eof();
+		
+		init("uR\"p()");
+		problem(IProblem.SCANNER_UNBOUNDED_STRING, "uR\"p()");
+		token(IToken.tUTF16STRING, "uR\"p()");
+		eof();
+		
+		init("UR\"(ut");
+		problem(IProblem.SCANNER_UNBOUNDED_STRING, "UR\"(ut");
+		token(IToken.tUTF32STRING, "UR\"(ut");
+		eof();
+		
+		init("R\"+=(Text)=+\"Text)+=\"");
+		rstr("+=", "Text)=+\"Text");
+		eof();
+		
+		init("UR uR LR u8R U8R\"\"");
+		id("UR"); ws();
+		id("uR"); ws();
+		id("LR"); ws();
+		id("u8R"); ws();
+		id("U8R"); str(""); 
+		eof();
+	}
+		
+	public void testRawStringLiteralInInactiveCode() throws Exception {
+		init("start\n" + "inactive: Rbla\n" + "#end");
+		id("start");
+		nextDirective();
+		token(IToken.tPOUND);
+		id("end");
+		eof();
+
+		// raw string containing a directive
+		init("start\n" + "inactive: uR\"(\n#endif\n)\"\n" + "#end");
+		id("start");
+		nextDirective();
+		token(IToken.tPOUND);
+		id("end");
+		eof();
+	}
+
 	public void testOperatorAndPunctuators() throws Exception {
 		final String ops= "{}[]###()<::><%%>%:%:%:;:...?.::..*+-*/%^&|~=!<>+=-=*=/=%=" +
 		"^=&=|=<<>><<=>>===!=<=>=&&||++--,->*-><?>?\\";
@ -563,17 +669,17 @@ public class LexerTests extends BaseTestCase {
 				StringBuffer buf= new StringBuffer();
 				String input= useTrigraphs(ops.toCharArray(), trigraphs);
 				init(instertLineSplices(input, splices)); 
-				for (int i = 0; i < tokens.length; i++) {
+				for (int token2 : tokens) {
 					Token token= fLexer.currentToken();
 					buf.append(token.getCharImage());
-					token(tokens[i]);
+					token(token2);
 				}
 				eof();
 				assertEquals(ops, buf.toString()); // check token image

 				init(input, NO_MINMAX); 
-				for (int i = 0; i < tokens.length; i++) {
-					switch (tokens[i]) {
+				for (int token : tokens) {
+					switch (token) {
 					case IGCCToken.tMIN:
 						token(IToken.tLT);
 						token(IToken.tQUESTION);
@ -583,7 +689,7 @@ public class LexerTests extends BaseTestCase {
 						token(IToken.tQUESTION);
 						break;
 					default:
-						token(tokens[i]);
+						token(token);
 					break;
 					}
 				}
@ -630,8 +736,7 @@ public class LexerTests extends BaseTestCase {

 		boolean yes= mode > 1;
 		StringBuffer result= new StringBuffer();
-		for (int i = 0; i < input.length; i++) {
-			char c = input[i];
+		for (char c : input) {
 			int idx= TRIGRAPH_REPLACES_CHARS.indexOf(c);
 			if (idx > 0) {
 				if (yes) {
--- a/core/org.eclipse.cdt.core/parser/org/eclipse/cdt/internal/core/dom/parser/cpp/CPPASTLiteralExpression.java
+++ b/core/org.eclipse.cdt.core/parser/org/eclipse/cdt/internal/core/dom/parser/cpp/CPPASTLiteralExpression.java
@ -121,8 +121,26 @@ public class CPPASTLiteralExpression extends ASTNode implements ICPPASTLiteralEx
 	private IValue getStringLiteralSize() {
 		char[] value= getValue();
 		int length= value.length-1;
-		if (value[0] != '"') {
-			length--;
+		boolean isRaw= false;
+		for (int i = 0; i < length; i++) {
+			final char c = value[i];
+			if (c == '"') {
+				if (isRaw) {
+					for (int j=i+1; j<length; j++) {
+						final char d= value[j];
+						if (d == '(') {
+							length -= 2*(j-i);
+							break;
+						}
+					}
+				}
+				length -= i;
+				if (length < 0)
+					length= 0;
+				break;
+			} else if (c == 'R') {
+				isRaw = true;
+			}
 		}
 		return Value.create(length);
 	}
--- a/core/org.eclipse.cdt.core/parser/org/eclipse/cdt/internal/core/parser/scanner/Lexer.java
+++ b/core/org.eclipse.cdt.core/parser/org/eclipse/cdt/internal/core/parser/scanner/Lexer.java
@ -183,7 +183,6 @@ final public class Lexer implements ITokenSequence {
 	 * @param origin parameter for the {@link OffsetLimitReachedException} when it has to be thrown.
 	 * @since 5.0
 	 */
-	@SuppressWarnings("fallthrough")
 	public final int consumeLine(int origin) throws OffsetLimitReachedException {
 		Token t= fToken;
 		Token lt= null;
@ -200,7 +199,7 @@ final public class Lexer implements ITokenSequence {
 					t.setType(IToken.tCOMPLETION);
 					throw new OffsetLimitReachedException(origin, t);
 				}
-				// no break;
+				//$FALL-THROUGH$
 			case Lexer.tNEWLINE:
 				fToken= t;
 				if (lt != null) {
@ -233,8 +232,7 @@ final public class Lexer implements ITokenSequence {
 			final int pos= fEndOffset;
 			if (!isValidOffset(pos+1)) {
 				d= nextCharPhase3();
-			}
-			else {
+			} else {
 				d= fInput.get(pos);
 				switch(d) {
 				case '\\': 
@ -272,8 +270,15 @@ final public class Lexer implements ITokenSequence {
 				haveNL= hadNL;
 				continue;
 				
+			case 'R':
+				if (d == '"') {
+					nextCharPhase3();
+					rawStringLiteral(start, 2, IToken.tSTRING);
+				}
+				continue;
+				
 			case '"':
-				stringLiteral(start, IToken.tSTRING);
+				stringLiteral(start, 1, IToken.tSTRING);
 				continue;

 			case '\'':
@ -356,9 +361,17 @@ final public class Lexer implements ITokenSequence {

 			case 'L':
 				switch(d) {
+				case 'R':
+					markPhase3();
+					if (nextCharPhase3() == '"') {
+						nextCharPhase3();
+						return rawStringLiteral(start, 3, IToken.tLSTRING);
+					}
+					restorePhase3();
+					break;
 				case '"':
 					nextCharPhase3();
-					return stringLiteral(start, IToken.tLSTRING);
+					return stringLiteral(start, 2, IToken.tLSTRING);
 				case '\'':
 					nextCharPhase3();
 					return charLiteral(start, IToken.tLCHAR);
@ -368,14 +381,46 @@ final public class Lexer implements ITokenSequence {
 			case 'u': 	
 			case 'U':
 				if (fOptions.fSupportUTFLiterals) {
-					if (d == '"') {
+					switch(d) {
+					case 'R':
+						markPhase3();
+						if (nextCharPhase3() == '"') {
 							nextCharPhase3();
-						return stringLiteral(start, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING);
+							return rawStringLiteral(start, 3, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING);
 						}
-					if (d == '\'') {
+						restorePhase3();
+						break;
+					case '"':
+						nextCharPhase3();
+						return stringLiteral(start, 2, c == 'u' ? IToken.tUTF16STRING : IToken.tUTF32STRING);
+					case '\'':
 						nextCharPhase3();
 						return charLiteral(start, c == 'u' ? IToken.tUTF16CHAR : IToken.tUTF32CHAR);
+					case '8':
+						if (c == 'u') {
+							markPhase3();
+							switch (nextCharPhase3()) {
+							case 'R':
+								if (nextCharPhase3() == '"') {
+									nextCharPhase3();
+									return rawStringLiteral(start, 4, IToken.tSTRING);
 								}
+								break;
+							case '"':
+								nextCharPhase3();
+								return stringLiteral(start, 3, IToken.tSTRING);
+							}
+							restorePhase3();
+						}
+						break;
+					}
+				}
+				return identifier(start, 1);
+				
+			case 'R':
+				if (d == '"') {
+					nextCharPhase3();
+					return rawStringLiteral(start, 2, IToken.tSTRING);
 				}
 				return identifier(start, 1);
 				
@ -383,7 +428,7 @@ final public class Lexer implements ITokenSequence {
 				if (fInsideIncludeDirective) {
 					return headerName(start, true);
 				}
-				return stringLiteral(start, IToken.tSTRING);
+				return stringLiteral(start, 1, IToken.tSTRING);

 			case '\'':
 				return charLiteral(start, IToken.tCHAR);
@ -392,7 +437,7 @@ final public class Lexer implements ITokenSequence {
 			case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 
 			case 's': case 't':           case 'v': case 'w': case 'x': case 'y': case 'z':
 			case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I':
-			case 'J': case 'K':           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 
+			case 'J': case 'K':           case 'M': case 'N': case 'O': case 'P': case 'Q':  
 			case 'S': case 'T':           case 'V': case 'W': case 'X': case 'Y': case 'Z':
 			case '_':
 				return identifier(start, 1);
@ -697,7 +742,6 @@ final public class Lexer implements ITokenSequence {
    	fLog.handleProblem(problemID, arg, offset, fOffset);
    }

-    @SuppressWarnings("fallthrough")
 	private Token headerName(final int start, final boolean expectQuotes) throws OffsetLimitReachedException {
    	int length= 1;
 		boolean done = false;
@ -709,7 +753,7 @@ final public class Lexer implements ITokenSequence {
 					throw new OffsetLimitReachedException(ORIGIN_LEXER, 
 							newToken((expectQuotes ? tQUOTE_HEADER_NAME : tSYSTEM_HEADER_NAME), start, length));
 				}
-				// no break;
+				//$FALL-THROUGH$
 			case '\n':
 				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
 				break loop;
@ -758,12 +802,10 @@ final public class Lexer implements ITokenSequence {
 		}
 	}

-	@SuppressWarnings("fallthrough")
-	private Token stringLiteral(final int start, final int tokenType) throws OffsetLimitReachedException {
+	private Token stringLiteral(final int start, int length, final int tokenType) throws OffsetLimitReachedException {
 		boolean escaped = false;
 		boolean done = false;
 		
-		int length = tokenType == IToken.tSTRING ? 1 : 2;
 		int c= fCharPhase3;
 		
 		loop: while (!done) {
@ -772,7 +814,7 @@ final public class Lexer implements ITokenSequence {
 				if (fSupportContentAssist) {
 					throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length));
 				}
-				// no break;
+				//$FALL-THROUGH$
 			case '\n':
 				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, fOffset), start);
 				break loop;
@ -796,7 +838,53 @@ final public class Lexer implements ITokenSequence {
 		return newToken(tokenType, start, length);
 	}

-	@SuppressWarnings("fallthrough")
+	private Token rawStringLiteral(final int start, int length, final int tokenType) throws OffsetLimitReachedException {
+		final int delimOffset= fOffset;
+		int delimEndOffset = delimOffset;
+		int offset;
+		for(;; delimEndOffset++) {
+			if (!fInput.isValidOffset(delimEndOffset)) {
+				offset= delimEndOffset;
+				break;
+			}
+			if (fInput.get(delimEndOffset) == '(') {
+				offset= delimEndOffset+1;
+				break;
+			}
+		}
+		
+		final int delimLength= delimEndOffset-delimOffset;
+		for(;; offset++) {
+			if (!fInput.isValidOffset(offset)) {
+				handleProblem(IProblem.SCANNER_UNBOUNDED_STRING, getInputChars(start, offset), start);
+				break;
+			} 
+				
+			final char c= fInput.get(offset);
+			if (c == ')') {
+				final int endingDoubleQuoteOffset= offset+delimLength+1;
+				if (fInput.isValidOffset(endingDoubleQuoteOffset) && fInput.get(endingDoubleQuoteOffset) == '"') {
+					boolean prefixMatches= true;
+					for (int i = 0; i < delimLength; i++) {
+						if (fInput.get(offset + i + 1) != fInput.get(delimOffset+i)) {
+							prefixMatches= false;
+							break;
+						}
+					}
+					if (prefixMatches) {
+						offset= endingDoubleQuoteOffset+1;
+						break;
+					}
+				}
+			}
+		}
+		fOffset= offset-1;
+		fEndOffset= offset;
+		fCharPhase3=  0;
+		nextCharPhase3();
+		return newToken(tokenType, start, offset-start);
+	}
+
 	private Token charLiteral(final int start, final int tokenType) throws OffsetLimitReachedException {
 		boolean escaped = false;
 		boolean done = false;
@ -809,7 +897,7 @@ final public class Lexer implements ITokenSequence {
 				if (fSupportContentAssist) {
 					throw new OffsetLimitReachedException(ORIGIN_LEXER, newToken(tokenType, start, length));
 				}
-				// no break;
+				//$FALL-THROUGH$
 			case '\n':
 				handleProblem(IProblem.SCANNER_BAD_CHARACTER, getInputChars(start, fOffset), start);
 				break loop;
@ -990,7 +1078,8 @@ final public class Lexer implements ITokenSequence {
 	
 	
 	/**
-	 * Saves the current state of phase3, necessary for '...', '%:%:' and UNCs.
+	 * Saves the current state of phase3, necessary for '...', '%:%:', UNCs and string literals
+	 * with a long prefix.
 	 */
 	private void markPhase3() {
 		fMarkOffset= fOffset;
@ -1009,9 +1098,8 @@ final public class Lexer implements ITokenSequence {
 	
 	/**
 	 * Perform phase 1-3: Replace \r\n with \n, handle trigraphs, detect line-splicing.
-	 * Changes fOffset, fEndOffset and fCharPhase3, stateless otherwise.
+	 * Changes fOffset, fEndOffset and fCharPhase3, state-less otherwise.
 	 */
-	@SuppressWarnings("fallthrough")
 	private int nextCharPhase3() {
 		int pos= fEndOffset;
 		do {
@ -1057,7 +1145,7 @@ final public class Lexer implements ITokenSequence {
 					return trigraph;
 				}
 				pos+= 2;
-				// no break, handle backslash
+				// $FALL-THROUGH$, handle backslash

 			case '\\':
 				final int lsPos= findEndOfLineSpliceSequence(pos);
@ -1097,9 +1185,8 @@ final public class Lexer implements ITokenSequence {
 	}

 	/**
-	 * Returns the endoffset for a line-splice sequence, or -1 if there is none.
+	 * Returns the end-offset for a line-splice sequence, or -1 if there is none.
 	 */
-	@SuppressWarnings("fallthrough")
 	private int findEndOfLineSpliceSequence(int pos) {
 		boolean haveBackslash= true;
 		int result= -1;
@ -1123,7 +1210,7 @@ final public class Lexer implements ITokenSequence {
 				if (!isValidOffset(pos+1) || fInput.get(pos) != '?' || fInput.get(++pos) != '/') {
 					return result;
 				}
-				// fall through to backslash handling
+				// $FALL-THROUGH$ to backslash handling
 					
 			case '\\':
 				if (!haveBackslash) {