////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2020 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtil;

/**
 * <p>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as <code>&#92;u221e</code>). An escape may be written with several
 * consecutive {@code u} markers (such as <code>&#92;uu221e</code>); all such
 * forms are recognized.
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. By the option it is possible to
 * allow using escapes if the literal contains only them.
 * </p>
 * <ul>
 * <li>
 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
 * non-printable, control characters.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
 * non-printable, whitespace characters.
 * Default value is {@code false}.
 * </li>
 * </ul>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name=&quot;AvoidEscapedUnicodeCharacters&quot;/&gt;
 * </pre>
 * <p>
 * An example of non-printable, control characters.
 * </p>
 * <pre>
 * return '&#92;ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable, control characters:
 * </p>
 * <pre>
 * &lt;module name=&quot;AvoidEscapedUnicodeCharacters&quot;&gt;
 *   &lt;property name=&quot;allowEscapesForControlCharacters&quot; value=&quot;true&quot;/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with trail comment:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if trail comment is present:
 * </p>
 * <pre>
 * &lt;module name=&quot;AvoidEscapedUnicodeCharacters&quot;&gt;
 *   &lt;property name=&quot;allowByTailComment&quot; value=&quot;true&quot;/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if literal contains only them:
 * </p>
 * <pre>
 * &lt;module name=&quot;AvoidEscapedUnicodeCharacters&quot;&gt;
 *   &lt;property name=&quot;allowIfAllCharactersEscaped&quot; value=&quot;true&quot;/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * for non-printable, whitespace characters:
 * </p>
 * <pre>
 * &lt;module name=&quot;AvoidEscapedUnicodeCharacters&quot;&gt;
 *   &lt;property name=&quot;allowNonPrintableEscapes&quot; value=&quot;true&quot;/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /**
     * Regular expression for Unicode escapes: a backslash, one or more
     * {@code u} markers, then exactly four hexadecimal digits.
     */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F0-9]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
            + "(00[0-1][0-9A-Fa-f]"
            + "|00[8-9][0-9A-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /** Regular expression for all escaped chars. */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^(\\\\u+[a-fA-F0-9]{4}"
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\\\r"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /** Regular expression for non-printable unicode chars. */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u+0000"
            + "|\\\\u+0009"
            + "|\\\\u+000[bB]"
            + "|\\\\u+000[cC]"
            + "|\\\\u+0020"
            + "|\\\\u+007[fF]"
            + "|\\\\u+0085"
            + "|\\\\u+009[fF]"
            + "|\\\\u+00[aA]0"
            + "|\\\\u+00[aA][dD]"
            + "|\\\\u+04[fF]9"
            + "|\\\\u+05[bB][eE]"
            + "|\\\\u+05[dD]0"
            + "|\\\\u+05[eE][aA]"
            + "|\\\\u+05[fF]3"
            + "|\\\\u+05[fF]4"
            + "|\\\\u+0600"
            + "|\\\\u+0604"
            + "|\\\\u+061[cC]"
            + "|\\\\u+06[dD]{2}"
            + "|\\\\u+06[fF]{2}"
            + "|\\\\u+070[fF]"
            + "|\\\\u+0750"
            + "|\\\\u+077[fF]"
            + "|\\\\u+0[eE]00"
            + "|\\\\u+0[eE]7[fF]"
            + "|\\\\u+1680"
            + "|\\\\u+180[eE]"
            + "|\\\\u+1[eE]00"
            + "|\\\\u+2000"
            + "|\\\\u+2001"
            + "|\\\\u+2002"
            + "|\\\\u+2003"
            + "|\\\\u+2004"
            + "|\\\\u+2005"
            + "|\\\\u+2006"
            + "|\\\\u+2007"
            + "|\\\\u+2008"
            + "|\\\\u+2009"
            + "|\\\\u+200[aA]"
            + "|\\\\u+200[fF]"
            + "|\\\\u+2025"
            + "|\\\\u+2028"
            + "|\\\\u+2029"
            + "|\\\\u+202[fF]"
            + "|\\\\u+205[fF]"
            + "|\\\\u+2064"
            + "|\\\\u+2066"
            + "|\\\\u+2067"
            + "|\\\\u+2068"
            + "|\\\\u+2069"
            + "|\\\\u+206[aA]"
            + "|\\\\u+206[fF]"
            + "|\\\\u+20[aA][fF]"
            + "|\\\\u+2100"
            + "|\\\\u+213[aA]"
            + "|\\\\u+3000"
            + "|\\\\u+[dD]800"
            + "|\\\\u+[fF]8[fF]{2}"
            + "|\\\\u+[fF][bB]50"
            + "|\\\\u+[fF][dD][fF]{2}"
            + "|\\\\u+[fF][eE]70"
            + "|\\\\u+[fF][eE][fF]{2}"
            + "|\\\\u+[fF]{2}0[eE]"
            + "|\\\\u+[fF]{2}61"
            + "|\\\\u+[fF]{2}[dD][cC]"
            + "|\\\\u+[fF]{3}9"
            + "|\\\\u+[fF]{3}[aA]"
            + "|\\\\u+[fF]{3}[bB]"
            + "|\\\\u+[fF]{4}");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal = ast.getText();

        // A violation is logged when the literal holds a Unicode escape and
        // none of the enabled "allow" options excuses it.
        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal,
     * so that a backslash which merely follows an escaped backslash is not
     * mistaken for the start of a Unicode escape.
     *
     * @param literal String literal.
     * @return the literal text without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks that every Unicode escape in the literal is matched by the given
     * pattern of valid characters. Escaped backslashes are removed first, the
     * same way as in {@link #hasUnicodeChar(String)}, so both methods agree on
     * what counts as a Unicode escape.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes in the literal equals the
     *     number of matches of {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String text = removeEscapedBackslashes(literal);
        return countMatches(UNICODE_REGEXP, text) == countMatches(pattern, text);
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        boolean result = false;
        final int lineNo = ast.getLineNo();
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // Only the last block comment starting on this line can be trailing.
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing.
     *
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment spans more than one line, or if only blank
     *     text follows it on the line.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal is escaped.
     * Always false unless {@code allowIfAllCharactersEscaped} is enabled.
     *
     * @param literal current literal, including its surrounding quotes.
     * @return true if all characters in String literal is escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        // The first and last characters are the quotes of the literal; skip them.
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
                        literal.length() - 1)).find();
    }

}