////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2020 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;

/**
 * <p>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as &#92;u221e). It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if trail comment is present. By the option it is possible to
 * allow using escapes if literal contains only them.
 * </p>
 * <ul>
 * <li>
 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
 * non-printable, control characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
 * non-printable, whitespace characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * </ul>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of non-printable, control characters.
 * </p>
 * <pre>
 * return '&#92;ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable, control characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with trail comment:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if trail comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * for non-printable, whitespace characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
 * </p>
 * <p>
 * Violation Message Keys:
 * </p>
 * <ul>
 * <li>
 * {@code forbid.escaped.unicode.char}
 * </li>
 * </ul>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /** Regular expression for Unicode chars. */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
            + "(00[0-1][0-9A-Fa-f]"
            + "|00[8-9][0-9A-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /** Regular expression for all escaped chars. */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\\\r"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /** Regular expression for non-printable unicode chars. */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
            + "|\\\\u0009"
            + "|\\\\u000[bB]"
            + "|\\\\u000[cC]"
            + "|\\\\u0020"
            + "|\\\\u007[fF]"
            + "|\\\\u0085"
            + "|\\\\u009[fF]"
            + "|\\\\u00[aA]0"
            + "|\\\\u00[aA][dD]"
            + "|\\\\u04[fF]9"
            + "|\\\\u05[bB][eE]"
            + "|\\\\u05[dD]0"
            + "|\\\\u05[eE][aA]"
            + "|\\\\u05[fF]3"
            + "|\\\\u05[fF]4"
            + "|\\\\u0600"
            + "|\\\\u0604"
            + "|\\\\u061[cC]"
            + "|\\\\u06[dD]{2}"
            + "|\\\\u06[fF]{2}"
            + "|\\\\u070[fF]"
            + "|\\\\u0750"
            + "|\\\\u077[fF]"
            + "|\\\\u0[eE]00"
            + "|\\\\u0[eE]7[fF]"
            + "|\\\\u1680"
            + "|\\\\u180[eE]"
            + "|\\\\u1[eE]00"
            + "|\\\\u2000"
            + "|\\\\u2001"
            + "|\\\\u2002"
            + "|\\\\u2003"
            + "|\\\\u2004"
            + "|\\\\u2005"
            + "|\\\\u2006"
            + "|\\\\u2007"
            + "|\\\\u2008"
            + "|\\\\u2009"
            + "|\\\\u200[aA]"
            + "|\\\\u200[fF]"
            + "|\\\\u2025"
            + "|\\\\u2028"
            + "|\\\\u2029"
            + "|\\\\u202[fF]"
            + "|\\\\u205[fF]"
            + "|\\\\u2064"
            + "|\\\\u2066"
            + "|\\\\u2067"
            + "|\\\\u2068"
            + "|\\\\u2069"
            + "|\\\\u206[aA]"
            + "|\\\\u206[fF]"
            + "|\\\\u20[aA][fF]"
            + "|\\\\u2100"
            + "|\\\\u213[aA]"
            + "|\\\\u3000"
            + "|\\\\u[dD]800"
            + "|\\\\u[fF]8[fF]{2}"
            + "|\\\\u[fF][bB]50"
            + "|\\\\u[fF][dD][fF]{2}"
            + "|\\\\u[fF][eE]70"
            + "|\\\\u[fF][eE][fF]{2}"
            + "|\\\\u[fF]{2}0[eE]"
            + "|\\\\u[fF]{2}61"
            + "|\\\\u[fF]{2}[dD][cC]"
            + "|\\\\u[fF]{3}9"
            + "|\\\\u[fF]{3}[aA]"
            + "|\\\\u[fF]{3}[bB]"
            + "|\\\\u[fF]{4}");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
    }

    /**
     * Caches the comments of the file being processed so that
     * {@link #hasTrailComment(DetailAST)} can look them up by line number.
     *
     * @param rootAST the root of the tree.
     */
    @Override
    public void beginTree(DetailAST rootAST) {
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    /**
     * Logs a violation when the literal contains a Unicode escape that is not
     * covered by any of the enabled {@code allow*} options.
     *
     * @param ast a string or character literal token.
     */
    @Override
    public void visitToken(DetailAST ast) {
        final String literal = ast.getText();

        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal.
     * A backslash that is itself escaped cannot start a Unicode escape, so the
     * text following such a pair must not be mistaken for one.
     *
     * @param literal String literal.
     * @return the literal text without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Checks that every Unicode escape in the literal is matched by the given
     * pattern of permitted escapes.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes in the literal equals the
     *     number of matches of {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        // Count on the same normalised text that hasUnicodeChar inspects; otherwise
        // an escaped backslash followed by 'u' and four hex digits is counted as a
        // Unicode escape here although it is plain text.
        final String normalizedLiteral = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, normalizedLiteral);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, normalizedLiteral);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        boolean result = false;
        final int lineNo = ast.getLineNo();
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // Only the last block comment starting on this line can be trailing.
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing.
     *
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment text spans more than one line, or if nothing
     *     but blank text follows the end of the comment on the line.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal is escaped.
     *
     * @param literal current literal.
     * @return true if the {@code allowIfAllCharactersEscaped} option is enabled and
     *     the literal content (without its enclosing quotes) consists only of
     *     escaped characters.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
                        literal.length() - 1)).find();
    }

}