/* * [TestRegexFindQuotedString.java] * * Summary: Finding a quoted String with a regex. * * Copyright: (c) 2012-2017 Roedy Green, Canadian Mind Products, http://mindprod.com * * Licence: This software may be copied and used freely for any purpose but military. * http://mindprod.com/contact/nonmil.html * * Requires: JDK 1.8+ * * Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/ * * Version History: * 1.0 2012-05-25 initial release * 1.1 2012-05-26 make program verify its own results. */ package com.mindprod.example; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.lang.System.*; /** * Finding a quoted String with a regex. *

// Closes the class summary comment that the header line above leaves open. */
/**
 * Finding a quoted String with a regex.
 * <p>
 * This program is based on newsgroup posts by markspace (aka Brendan), Lew and Robert Klemme
 * in response to my query about the cleanest way to use a regex to find quoted Strings.
 *
 * @author Roedy Green, Canadian Mind Products
 * @version 1.1 2012-05-26 make program verify its own results.
 * @since 2012-05-25
 */
public class TestRegexFindQuotedString {
    /**
     * Test strings to search, alternating with what the Pattern should extract.
     * An empty expected value marks a test string in which the Pattern should find nothing.
     */
    private static final String[] alternatingTestExpectedPairs = {
            "basic: href=\"http://mindprod.com\" ",
            "\"http://mindprod.com\"",
            "Nested quote: George said \"that's the ticket\".",
            "\"that's the ticket\"",
            "Nested tick: Jeb replied '\"ticket?\"what ticket'.",
            "'\"ticket?\"what ticket'",
            "Non-ASCII: \"How na\u00efve!\".",
            "\"How na\u00efve!\"",
            "empty: \"\"xx",
            "\"\"",
            "\\ escaped: 'Bob\\'s your uncle.'",
            "'Bob\\'s your uncle.'",
            "unbalanced (should fail): 'wonky\"",
            "",
    };

    /**
     * Exercise a pattern to see if it finds the expected quoted string in each test string.
     * Prints one line per test string giving the test string, whether anything was found,
     * whether the outcome agrees with the expected value, and the text extracted (null if none).
     *
     * @param pattern candidate Pattern for finding a quoted String
     */
    private static void exercisePattern(Pattern pattern) {
        out.println();
        // display with Java string level quoting peeled off.
        out.println("Pattern: " + pattern.toString());
        for (int i = 0; i < alternatingTestExpectedPairs.length; i += 2) {
            final String test = alternatingTestExpectedPairs[i];
            final String expected = alternatingTestExpectedPairs[i + 1];
            final Matcher m = pattern.matcher(test);
            final boolean found = m.find();
            final String extracted = found ? m.group(0) : null;
            // Finding nothing is the right answer when nothing was expected (empty expected value);
            // otherwise the extracted text must equal the expected text exactly.
            final boolean correct = found ? extracted.equals(expected) : expected.isEmpty();
            out.println(test
                    + ", found: " + found
                    + ", correct: " + correct
                    + " (" + extracted + ")");
        }
    }

    /**
     * Test harness to exercise various candidate Patterns for finding quoted Strings.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        // We want to find Strings of the form "xx'xx" or 'xx"xx'
        // We want to avoid the following problems:
        // 1. Works even if String contains foreign languages, even Russian or accented letters.
        // 2. If starts with " must end with ", if starts with ' must end with '.
        // 3. ' is ok inside "...", and " is ok inside '...'
        // 4. It should accept empty strings "" and ''.
        // 5. We usually don't worry about how to use ' inside '...' since there are so many
        //    different conventions, but for bonus points ignore \' and \".

        // here are some suggested candidate Patterns to find quoted Strings:

        // fails 1 2 3 4
        exercisePattern(Pattern.compile("[\"']\\p{Print}+?[\"']"));

        // fails 2 3 4
        exercisePattern(Pattern.compile("[\"'][^\"']+[\"']"));

        // fails 3 4, uses a capturing group.
        exercisePattern(Pattern.compile("([\"'])[^\"']+\\1"));

        // fails 4
        exercisePattern(Pattern.compile("\"[^\"]+\"|'[^']+'"));

        // works, but fails bonus 5
        exercisePattern(Pattern.compile("\"[^\"]*\"|'[^']*'"));

        // works, even passes 5.
        // (?: ) is a non-capturing group. \\\\ is a literal \. . means any char.
        // Caveat: the Java literal [^\\\"] is the regex [^\"], which excludes only the quote,
        // not the backslash, so on backtracking a backslash can still be consumed as an ordinary
        // character (e.g. an unterminated 'abc\' is found as a complete string).
        exercisePattern(Pattern.compile("\"(?:\\\\.|[^\\\"])*\"|'(?:\\\\.|[^\\'])*'"));

        // In the above code, I pass a Pattern rather than a more concise raw String
        // because when I do it that way the IntelliJ IDE
        // does some proofreading and formatting for me on the regexes.

        // A follow-on problem would be to find Patterns that extract just the contents of the
        // string, without the delimiters, and possibly even decode any embedded \.

        // These sorts of problem can get so hairy, it is sometimes simpler to hand code a little
        // parser or modify one you have already done, e.g. a finite state automaton like the ones
        // JDisplay/JPrep uses.
        // These of course will be much faster than a general purpose regex.
    }
}