001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2020 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CommonUtil;
033
034/**
035 * <p>
036 * Restricts using
037 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
038 * Unicode escapes</a>
039 * (such as &#92;u221e). It is possible to allow using escapes for
040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
041 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. Another option makes it possible to
 * allow escapes if the literal contains only escaped characters.
045 * </p>
046 * <ul>
047 * <li>
048 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
049 * non-printable, control characters.
050 * Default value is {@code false}.
051 * </li>
052 * <li>
053 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
054 * Default value is {@code false}.
055 * </li>
056 * <li>
057 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
058 * Default value is {@code false}.
059 * </li>
060 * <li>
061 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
062 * non-printable, whitespace characters.
063 * Default value is {@code false}.
064 * </li>
065 * </ul>
066 * <p>
067 * Examples of using Unicode:</p>
068 * <pre>
069 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
070 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
071 * </pre>
072 * <p>
073 * An example of how to configure the check is:
074 * </p>
075 * <pre>
076 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
077 * </pre>
078 * <p>
079 * An example of non-printable, control characters.
080 * </p>
081 * <pre>
082 * return '&#92;ufeff' + content; // byte order mark
083 * </pre>
084 * <p>
085 * An example of how to configure the check to allow using escapes
086 * for non-printable, control characters:
087 * </p>
088 * <pre>
089 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
090 *   &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
091 * &lt;/module&gt;
092 * </pre>
093 * <p>
094 * Example of using escapes with trail comment:
095 * </p>
096 * <pre>
097 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
098 * </pre>
099 * <p>An example of how to configure the check to allow using escapes
100 * if trail comment is present:
101 * </p>
102 * <pre>
103 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
104 *   &lt;property name="allowByTailComment" value="true"/&gt;
105 * &lt;/module&gt;
106 * </pre>
107 * <p>Example of using escapes if literal contains only them:
108 * </p>
109 * <pre>
110 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
111 * </pre>
112 * <p>An example of how to configure the check to allow escapes
113 * if literal contains only them:
114 * </p>
115 * <pre>
116 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
117 *   &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
118 * &lt;/module&gt;
119 * </pre>
120 * <p>An example of how to configure the check to allow using escapes
121 * for non-printable, whitespace characters:
122 * </p>
123 * <pre>
124 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
125 *   &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
126 * &lt;/module&gt;
127 * </pre>
128 *
129 * @since 5.8
130 */
131@FileStatefulCheck
132public class AvoidEscapedUnicodeCharactersCheck
133    extends AbstractCheck {
134
135    /**
136     * A key is pointing to the warning message text in "messages.properties"
137     * file.
138     */
139    public static final String MSG_KEY = "forbid.escaped.unicode.char";
140
141    /** Regular expression for Unicode chars. */
142    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
143
144    /**
145     * Regular expression Unicode control characters.
146     *
147     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
148     *     Appendix:Control characters</a>
149     */
150    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
151            + "(00[0-1][0-9A-Fa-f]"
152            + "|00[8-9][0-9A-Fa-f]"
153            + "|00[aA][dD]"
154            + "|034[fF]"
155            + "|070[fF]"
156            + "|180[eE]"
157            + "|200[b-fB-F]"
158            + "|202[a-eA-E]"
159            + "|206[0-4a-fA-F]"
160            + "|[fF]{3}[9a-bA-B]"
161            + "|[fF][eE][fF]{2})");
162
163    /** Regular expression for all escaped chars. */
164    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
165            + "|\""
166            + "|'"
167            + "|\\\\"
168            + "|\\\\b"
169            + "|\\\\f"
170            + "|\\\\n"
171            + "|\\\\r"
172            + "|\\\\t"
173            + ")+$");
174
175    /** Regular expression for escaped backslash. */
176    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
177
178    /** Regular expression for non-printable unicode chars. */
179    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
180            + "|\\\\u0009"
181            + "|\\\\u000[bB]"
182            + "|\\\\u000[cC]"
183            + "|\\\\u0020"
184            + "|\\\\u007[fF]"
185            + "|\\\\u0085"
186            + "|\\\\u009[fF]"
187            + "|\\\\u00[aA]0"
188            + "|\\\\u00[aA][dD]"
189            + "|\\\\u04[fF]9"
190            + "|\\\\u05[bB][eE]"
191            + "|\\\\u05[dD]0"
192            + "|\\\\u05[eE][aA]"
193            + "|\\\\u05[fF]3"
194            + "|\\\\u05[fF]4"
195            + "|\\\\u0600"
196            + "|\\\\u0604"
197            + "|\\\\u061[cC]"
198            + "|\\\\u06[dD]{2}"
199            + "|\\\\u06[fF]{2}"
200            + "|\\\\u070[fF]"
201            + "|\\\\u0750"
202            + "|\\\\u077[fF]"
203            + "|\\\\u0[eE]00"
204            + "|\\\\u0[eE]7[fF]"
205            + "|\\\\u1680"
206            + "|\\\\u180[eE]"
207            + "|\\\\u1[eE]00"
208            + "|\\\\u2000"
209            + "|\\\\u2001"
210            + "|\\\\u2002"
211            + "|\\\\u2003"
212            + "|\\\\u2004"
213            + "|\\\\u2005"
214            + "|\\\\u2006"
215            + "|\\\\u2007"
216            + "|\\\\u2008"
217            + "|\\\\u2009"
218            + "|\\\\u200[aA]"
219            + "|\\\\u200[fF]"
220            + "|\\\\u2025"
221            + "|\\\\u2028"
222            + "|\\\\u2029"
223            + "|\\\\u202[fF]"
224            + "|\\\\u205[fF]"
225            + "|\\\\u2064"
226            + "|\\\\u2066"
227            + "|\\\\u2067"
228            + "|\\\\u2068"
229            + "|\\\\u2069"
230            + "|\\\\u206[aA]"
231            + "|\\\\u206[fF]"
232            + "|\\\\u20[aA][fF]"
233            + "|\\\\u2100"
234            + "|\\\\u213[aA]"
235            + "|\\\\u3000"
236            + "|\\\\u[dD]800"
237            + "|\\\\u[fF]8[fF]{2}"
238            + "|\\\\u[fF][bB]50"
239            + "|\\\\u[fF][dD][fF]{2}"
240            + "|\\\\u[fF][eE]70"
241            + "|\\\\u[fF][eE][fF]{2}"
242            + "|\\\\u[fF]{2}0[eE]"
243            + "|\\\\u[fF]{2}61"
244            + "|\\\\u[fF]{2}[dD][cC]"
245            + "|\\\\u[fF]{3}9"
246            + "|\\\\u[fF]{3}[aA]"
247            + "|\\\\u[fF]{3}[bB]"
248            + "|\\\\u[fF]{4}");
249
250    /** Cpp style comments. */
251    private Map<Integer, TextBlock> singlelineComments;
252    /** C style comments. */
253    private Map<Integer, List<TextBlock>> blockComments;
254
255    /** Allow use escapes for non-printable, control characters. */
256    private boolean allowEscapesForControlCharacters;
257
258    /** Allow use escapes if trail comment is present. */
259    private boolean allowByTailComment;
260
261    /** Allow if all characters in literal are escaped. */
262    private boolean allowIfAllCharactersEscaped;
263
264    /** Allow use escapes for non-printable, whitespace characters. */
265    private boolean allowNonPrintableEscapes;
266
267    /**
268     * Setter to allow use escapes for non-printable, control characters.
269     *
270     * @param allow user's value.
271     */
272    public final void setAllowEscapesForControlCharacters(boolean allow) {
273        allowEscapesForControlCharacters = allow;
274    }
275
276    /**
277     * Setter to allow use escapes if trail comment is present.
278     *
279     * @param allow user's value.
280     */
281    public final void setAllowByTailComment(boolean allow) {
282        allowByTailComment = allow;
283    }
284
285    /**
286     * Setter to allow if all characters in literal are escaped.
287     *
288     * @param allow user's value.
289     */
290    public final void setAllowIfAllCharactersEscaped(boolean allow) {
291        allowIfAllCharactersEscaped = allow;
292    }
293
294    /**
295     * Setter to allow use escapes for non-printable, whitespace characters.
296     *
297     * @param allow user's value.
298     */
299    public final void setAllowNonPrintableEscapes(boolean allow) {
300        allowNonPrintableEscapes = allow;
301    }
302
303    @Override
304    public int[] getDefaultTokens() {
305        return getRequiredTokens();
306    }
307
308    @Override
309    public int[] getAcceptableTokens() {
310        return getRequiredTokens();
311    }
312
313    @Override
314    public int[] getRequiredTokens() {
315        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
316    }
317
318    @Override
319    public void beginTree(DetailAST rootAST) {
320        singlelineComments = getFileContents().getSingleLineComments();
321        blockComments = getFileContents().getBlockComments();
322    }
323
324    @Override
325    public void visitToken(DetailAST ast) {
326        final String literal = ast.getText();
327
328        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
329                || isAllCharactersEscaped(literal)
330                || allowEscapesForControlCharacters
331                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
332                || allowNonPrintableEscapes
333                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
334            log(ast, MSG_KEY);
335        }
336    }
337
338    /**
339     * Checks if literal has Unicode chars.
340     *
341     * @param literal String literal.
342     * @return true if literal has Unicode chars.
343     */
344    private static boolean hasUnicodeChar(String literal) {
345        final String literalWithoutEscapedBackslashes =
346                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
347        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
348    }
349
350    /**
351     * Check if String literal contains Unicode control chars.
352     *
353     * @param literal String literal.
354     * @param pattern RegExp for valid characters.
355     * @return true, if String literal contains Unicode control chars.
356     */
357    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
358        final int unicodeMatchesCounter =
359                countMatches(UNICODE_REGEXP, literal);
360        final int unicodeValidMatchesCounter =
361                countMatches(pattern, literal);
362        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
363    }
364
365    /**
366     * Check if trail comment is present after ast token.
367     *
368     * @param ast current token.
369     * @return true if trail comment is present after ast token.
370     */
371    private boolean hasTrailComment(DetailAST ast) {
372        boolean result = false;
373        final int lineNo = ast.getLineNo();
374        if (singlelineComments.containsKey(lineNo)) {
375            result = true;
376        }
377        else {
378            final List<TextBlock> commentList = blockComments.get(lineNo);
379            if (commentList != null) {
380                final TextBlock comment = commentList.get(commentList.size() - 1);
381                final String line = getLines()[lineNo - 1];
382                result = isTrailingBlockComment(comment, line);
383            }
384        }
385        return result;
386    }
387
388    /**
389     * Whether the C style comment is trailing.
390     *
391     * @param comment the comment to check.
392     * @param line the line where the comment starts.
393     * @return true if the comment is trailing.
394     */
395    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
396        return comment.getText().length != 1
397            || CommonUtil.isBlank(line.substring(comment.getEndColNo() + 1));
398    }
399
400    /**
401     * Count regexp matches into String literal.
402     *
403     * @param pattern pattern.
404     * @param target String literal.
405     * @return count of regexp matches.
406     */
407    private static int countMatches(Pattern pattern, String target) {
408        int matcherCounter = 0;
409        final Matcher matcher = pattern.matcher(target);
410        while (matcher.find()) {
411            matcherCounter++;
412        }
413        return matcherCounter;
414    }
415
416    /**
417     * Checks if all characters in String literal is escaped.
418     *
419     * @param literal current literal.
420     * @return true if all characters in String literal is escaped.
421     */
422    private boolean isAllCharactersEscaped(String literal) {
423        return allowIfAllCharactersEscaped
424                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
425                        literal.length() - 1)).find();
426    }
427
428}