Tazio Mirandola

TOC:Hide

References
Basics Regex reference
Examples
Greedy / non-greedy
Miscellanea

references

basics Regex reference

.        any character, except for line breaks if dotall is false.
*        0 or more of the preceding character.
+        1 or more of the preceding character.
?        Preceding character is optional. Matches 0 or 1 occurrence.
a{4}     Exactly 4 a's
a{4,8}   Between (inclusive) 4 and 8 a's
a{9,}    9 or more a's
[abx-z]  One character of: a, b, or the range x-z
[XYZ]+   one or more of any of the characters in the set.
$        end of the string.
^        beginning of a string.
a|b      a or b
 
\d       A digit (same as [0-9])
\D       A non-digit (same as [^0-9])
\w       A word character (same as [_a-zA-Z0-9])
\W       A non-word character (same as [^_a-zA-Z0-9])
\s       A whitespace character
\S       A non-whitespace character
\b       A word boundary
\B       A non-word boundary
\n       A newline
\t       A tab
 
[^a-z]   When inside of a character class, the ^ means NOT; in this case, match anything that is NOT a lowercase letter.
[^abx-z] One character except: a, b, or the range x-z

var username = 'JohnSmith';
alert(/[A-Za-z_-]+/.test(username)); // alerts "true": test() returns a boolean
 
var str = 'this is my string';
alert(str.split(/\s/)); // alerts "this,is,my,string" (the resulting array is joined with commas)
 
var str = 'this is my this string';
alert(str.split(/\s/)[3]); // alerts "this": index 3 is the fourth word ("string" is at index 4)
 
var someString = 'Hello, World';
someString = someString.replace(/World/, 'Universe');
alert(someString); // alerts "Hello, Universe"
 
var name = 'JeffreyWay';
alert(name.match(/e/)); // alerts "e" (without the g flag only the first match is returned)
 
var email = 'test@gmail.com';
alert(email.replace(/([a-z\d_-]+)@([a-z\d_-]+)\.[a-z]{2,4}/ig, '$1, $2')); // "test, gmail"

Examples

Addresses

//Address: State code (US)
'/\\b(?:A[KLRZ]|C[AOT]|D[CE]|FL|GA|HI|I[ADLN]|K[SY]|LA|M[ADEINOST]|N[CDEHJMVY]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY])\\b/'

//Address: ZIP code (US)
'\b[0-9]{5}(?:-[0-9]{4})?\b'
Columns
//Columns: Match a regex starting at a specific column on a line.
'^.{%SKIPAMOUNT%}(%REGEX%)'

//Columns: Range of characters on a line, captured into backreference 1
//Iterate over all matches to extract a column of text from a file
//E.g. to grab the characters in columns 8..10, set SKIPAMOUNT to 7, and CAPTUREAMOUNT to 3
'^.{%SKIPAMOUNT%}(.{%CAPTUREAMOUNT%})'

Credit cards

//Credit card: All major cards
'^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6011[0-9]{12}|3(?:0[0-5]|[68][0-9])[0-9]{11}|3[47][0-9]{13})$'

//Credit card: American Express
'^3[47][0-9]{13}$'

//Credit card: Diners Club
'^3(?:0[0-5]|[68][0-9])[0-9]{11}$'

//Credit card: Discover
'^6011[0-9]{12}$'

//Credit card: MasterCard
'^5[1-5][0-9]{14}$'

//Credit card: Visa
'^4[0-9]{12}(?:[0-9]{3})?$'

//Credit card: remove non-digits
'/[^0-9]+/'

CSV

//CSV: Change delimiter
//Changes the delimiter from a comma into a tab.
//The capturing group makes sure delimiters inside double-quoted entries are ignored.
'("[^"\r\n]*")?,(?![^",\r\n]*"$)'

//CSV: Complete row, all fields.
//Match complete rows in a comma-delimited file that has 3 fields per row,
//capturing each field into a backreference.
//To match CSV rows with more or fewer fields, simply duplicate or delete the capturing groups.
'^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)$'

//CSV: Complete row, certain fields.
//Set %SKIPLEAD% to the number of fields you want to skip at the start, and %SKIPTRAIL% to
//the number of fields you want to ignore at the end of each row.
//This regex captures 3 fields into backreferences.  To capture more or fewer fields,
//simply duplicate or delete the capturing groups.
'^(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPLEAD%}("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPTRAIL%}$'

//CSV: Partial row, certain fields
//Match the first SKIPLEAD+3 fields of each row in a comma-delimited file that has SKIPLEAD+3
//or more fields per row.  The 3 fields after SKIPLEAD are each captured into a backreference.
//All other fields are ignored.  Rows that have less than SKIPLEAD+3 fields are skipped.
//To capture more or fewer fields, simply duplicate or delete the capturing groups.
'^(?:(?:"[^"\r\n]*"|[^,\r\n]*),){%SKIPLEAD%}("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)'

//CSV: Partial row, leading fields
//Match the first 3 fields of each row in a comma-delimited file that has 3 or more fields per row.
//The first 3 fields are each captured into a backreference.  All other fields are ignored.
//Rows that have less than 3 fields are skipped.  To capture more or fewer fields,
//simply duplicate or delete the capturing groups.
'^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)'

//CSV: Partial row, variable leading fields
//Match the first 3 fields of each row in a comma-delimited file.
//The first 3 fields are each captured into a backreference.
//All other fields are ignored.  If a row has fewer than 3 fields, some of the backreferences
//will remain empty.  To capture more or fewer fields, simply duplicate or delete the capturing groups.
//The question mark after each group makes that group optional.
'^("[^"\r\n]*"|[^,\r\n]*),("[^"\r\n]*"|[^,\r\n]*)?,("[^"\r\n]*"|[^,\r\n]*)?'
Dates
//Date d/m/yy and dd/mm/yyyy
//1/1/00 through 31/12/99 and 01/01/1900 through 31/12/2099
//Matches invalid dates such as February 31st
'\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](19|20)?[0-9]{2}\b'


//Delimiters: Replace commas with tabs
//Replaces commas with tabs, except for commas inside double-quoted strings
'((?:"[^",]*+")|[^,]++)*+,'

Dates

//Date dd/mm/yyyy
//01/01/1900 through 31/12/2099
//Matches invalid dates such as February 31st
'(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)[0-9]{2}'

//Date m/d/y and mm/dd/yyyy
//1/1/99 through 12/31/99 and 01/01/1900 through 12/31/2099
//Matches invalid dates such as February 31st
//Accepts dashes, spaces, forward slashes and dots as date separators
'\b(0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])[- /.](19|20)?[0-9]{2}\b'

//Date mm/dd/yyyy
//01/01/1900 through 12/31/2099
//Matches invalid dates such as February 31st
'(0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])[- /.](19|20)[0-9]{2}'

//Date yy-m-d or yyyy-mm-dd
//00-1-1 through 99-12-31 and 1900-01-01 through 2099-12-31
//Matches invalid dates such as February 31st
'\b(19|20)?[0-9]{2}[- /.](0?[1-9]|1[012])[- /.](0?[1-9]|[12][0-9]|3[01])\b'

//Date yyyy-mm-dd
//1900-01-01 through 2099-12-31
//Matches invalid dates such as February 31st
'(19|20)[0-9]{2}[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'

// Date "02-27-15  04:24PM" "09-25-14  11:08AM"
'/^([0-9\-\s\:]*(PM|AM){1})\s*/'

Email addresses

//Email address
//Use this version to seek out email addresses in random documents and texts.
//Does not match email addresses using an IP address instead of a domain name.
//Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.
//Including these increases the risk of false positives when applying the regex to random documents.
'\b[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'

//Email address (anchored)
//Use this anchored version to check if a valid email address was entered.
//Does not match email addresses using an IP address instead of a domain name.
//Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.
//Requires the "case insensitive" option to be ON.
'^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$'

//Email address (anchored; no consecutive dots)
//Use this anchored version to check if a valid email address was entered.
//Improves on the original email address regex by excluding addresses with consecutive dots such as test@gmail..com
//Does not match email addresses using an IP address instead of a domain name.
//Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.
//Including these increases the risk of false positives when applying the regex to random documents.
'^[A-Z0-9._%-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$'

//Email address (no consecutive dots)
//Use this version to seek out email addresses in random documents and texts.
//Improves on the original email address regex by excluding addresses with consecutive dots such as test@gmail..com
//Does not match email addresses using an IP address instead of a domain name.
//Does not match email addresses on new-fangled top-level domains with more than 4 letters such as .museum.
//Including these increases the risk of false positives when applying the regex to random documents.
'\b[A-Z0-9._%-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}\b'

// Email address (specific TLDs)
// Does not match email addresses using an IP address instead of a domain name.
// Matches all country code top level domains, and specific common top level domains.
'^[A-Z0-9._%-]+@[A-Z0-9.-]+\.(?:[A-Z]{2}|com|org|net|biz|info|name|aero|biz|info|jobs|museum|name)$'

// Email address: Replace with HTML link
'\b(?:mailto:)?([A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b'

HTML

// HTML comment
'<!--.*?-->'

// HTML file
// Matches a complete HTML file.  Place round brackets around the .*? parts you want to extract from the file.
// Performance will be terrible on HTML files that miss some of the tags
// (and thus won't be matched by this regular expression).  Use the atomic version instead when your search
// includes such files (the atomic version will also fail invalid files, but much faster).
'<html>.*?<head>.*?<title>.*?</title>.*?</head>.*?<body[^>]*>.*?</body>.*?</html>'

// HTML file (atomic)
// Matches a complete HTML file.  Place round brackets around the .*? parts you want to extract from the file.
// Atomic grouping maintains the regular expression's performance on invalid HTML files.
'<html>(?>.*?<head>)(?>.*?<title>)(?>.*?</title>)(?>.*?</head>)(?>.*?<body[^>]*>)(?>.*?</body>).*?</html>'

// HTML tag
// Matches the opening and closing pair of whichever HTML tag comes next.
// The name of the tag is stored into the first capturing group.
// The text between the tags is stored into the second capturing group.
'<([A-Z][A-Z0-9]*)[^>]*>(.*?)</\1>'

// HTML tag
// Matches the opening and closing pair of a specific HTML tag.
// Anything between the tags is stored into the first capturing group.
// Does NOT properly match tags nested inside themselves.
'<%TAG%[^>]*>(.*?)</%TAG%>'

// HTML tag
// Matches any opening or closing HTML tag, without its contents.
'</?[a-z][a-z0-9]*[^<>]*>'

// parse css
'([^{]+)\s*\{\s*([^}]+)\s*}'
'^(.*)\s\{([a-zA-Z0-9\-:;#" ]*)\s\}$'
'^([\S^{]+)\s*\{\s*([^}]+)\s*}$'

find a specific tag (here: headings h1 to h6):

// Print the level and text of every <h1>..<h6> heading found in $html.
// preg_match_all() is required here: it fills $matches[0], [1] and [2] with
// arrays (full matches, heading levels, heading texts), which the loop below
// indexes. preg_match() would stop at the first heading and store plain
// strings, so count($matches[0]) and $matches[1][$i] would not work.
// The \1 backreference makes the closing tag use the same level as the
// opening one; /i ignores case and /s lets the heading text span lines.
preg_match_all('#<h([1-6])>(.+?)</h\1>#is', $html, $matches);
for ($i = 0, $j = count($matches[0]); $i < $j; $i++) {
    echo $matches[1][$i].' '.$matches[2][$i] . "\n";
}

greedy / non-greedy

You want your pattern to match the smallest possible string instead of the largest

to get a greedy quantifier inside of a pattern operating under a trailing /U, just add a ? to the end

// Each call prints the number of matches found in "php".
print preg_match_all('/p.*/', "php");  // greedy: 1 match ("php")
print preg_match_all('/p.*?/', "php"); // nongreedy: 2 matches ("p" and "p")
print preg_match_all('/p.*/U', "php"); // nongreedy via the /U modifier: 2 matches

greedy:

// Greedy .+ runs to the LAST </b>, so everything between the first <b> and
// the final </b> ends up in a single capture.
$html = '<b>I am bold.</b> <i>I am italic.</i> <b>I am also bold.</b>';
preg_match_all('#<b>(.+)</b>#', $html, $bolds);
print_r($bolds[1]); // print the captured group so the output below is actually produced
/*Array (
    [0] => I am bold.</b> <i>I am italic.</i> <b>I am also bold.
)*/

non-greedy:

// Lazy .+? stops at the first </b> it can reach, so each bold span is
// captured separately.
$html = '<b>I am bold.</b> <i>I am italic.</i> <b>I am also bold.</b>';
preg_match_all('#<b>(.+?)</b>#', $html, $bolds);
print_r($bolds[1]);
/* Array (
    [0] => I am bold.
    [1] => I am also bold.
)*/

miscellanea

// clean non alphanumerical
// replaces every non-alphanumeric character (which can cause typing problems) with an underscore
$pass = preg_replace('/[^A-Za-z0-9]/', '_', $pass);

spaces:

$re = '/\s+/'; // one or more whitespace characters
$re = '/\S+/'; // one or more characters that are not whitespace

// The apostrophe has to be escaped inside a single-quoted PHP string;
// unescaped, it ends the string literal early and the script fails to parse.
$re = '/[A-Z\'-]+/i'; // all upper and lowercase letters, apostrophes, and hyphens, but no space

a single space: " " (one space).

one or more: "  *" (that's two spaces and an asterisk) or " +" (one space and a plus).

If you're looking for common spacing, use "[ X]" or "[ X][ X]*" or "[ X]+" where X is the physical tab character (and each is preceded by a single space in all those examples).

in modern preg: "\s" and its variations are the way to go.

Note that word boundaries (\b) also match at the start and end of a line, which is important when you're looking for words that may appear without preceding or following spaces.

remove all non valid characters:

$newtag = preg_replace ("/[^a-zA-Z0-9 ]/", "", $tag);
#                                    ^ space here

to ensure there's only one space between each word and none at the start or end, that's a little more complicated (and probably another question)

# Each step must work on the result of the previous one ($newtag), not on the
# original $tag again; otherwise only the last replacement takes effect.
$newtag = preg_replace ("/ +/", " ", $tag);    # convert all multispaces to space
$newtag = preg_replace ("/^ /", "", $newtag);  # remove space from start
$newtag = preg_replace ("/ $/", "", $newtag);  # and end

find links in HTML

/**
 * Extract the links from a chunk of HTML.
 *
 * @param string $s HTML text to scan.
 * @return array List of array(url, link text) pairs, one per <a ... href=...>
 *               element found, in document order. Empty when nothing matches.
 */
function pc_link_extractor($s) {
  $a = array();
  // [^>]*? (instead of .*?) keeps the search for href= inside the opening
  // tag, so an anchor without href cannot borrow a later "href=" from its
  // link text or from another tag.
  // /s lets the link text span several lines; /i ignores tag-name case.
  $re = '/<a\s+[^>]*?href=[\"\']?([^\"\'>]*)[\"\']?[^>]*>(.*?)<\/a>/is';
  if (preg_match_all($re, $s, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
      // $match[1] is the URL, $match[2] the text between <a ...> and </a>
      $a[] = array($match[1], $match[2]);
    }
  }
  return $a;
}
// usage:
// Both anchors carry a well-formed href attribute so that both are extracted.
$page=<<<END
Click <a href="http://www.oreilly.com">here</a> to visit a computer book
publisher. Click <a href="http://www.sklar.com">over here</a> to visit
a computer book author.
END;
$links = pc_link_extractor($page);
// $links now holds one array(url, link text) pair per anchor:
//   array('http://www.oreilly.com', 'here')
//   array('http://www.sklar.com', 'over here')

parsing web server NCSA Combined Log Format:

// Count how often each request path appears in an NCSA Combined Log Format
// access log and print the totals, most frequent first.
$log_file = '/usr/local/apache/logs/access.log';

// One capture group per field: remote host, logname, user, [time],
// "method request protocol", status, bytes, "referer", "user agent".
// The pattern must stay on a single line: without the /x modifier a line
// break inside the literal would have to match literally, and no log line
// would ever match.
$pattern = '/^([^ ]+) ([^ ]+) ([^ ]+) (\[[^\]]+\]) "(.*) (.*) (.*)" ([0-9\-]+) ([0-9\-]+) "(.*)" "(.*)"$/';

$fh = fopen($log_file,'r');
if ($fh === false) {
    // $php_errormsg no longer exists in PHP 8; error_get_last() is the replacement
    $err = error_get_last();
    die($err ? $err['message'] : "Can't open $log_file");
}

$i = 1;
$requests = array();
// Loop on fgets() itself rather than feof(), so a failed read ends the loop
while (($line = fgets($fh)) !== false) {
    // trim off leading/trailing whitespace and skip blank lines
    $s = trim($line);
    if ($s !== '') {
        // match the line to the pattern
        if (preg_match($pattern,$s,$matches)) {
            /* put each part of the match in an appropriately-named
             * variable */
            list($whole_match,$remote_host,$logname,$user,$time,
                 $method,$request,$protocol,$status,$bytes,$referer,
                 $user_agent) = $matches;
            // keep track of the count of each request; start at 0 the first
            // time a request is seen to avoid incrementing an undefined key
            if (!isset($requests[$request])) {
                $requests[$request] = 0;
            }
            $requests[$request]++;
        } else {
            // complain if the line didn't match the pattern
            error_log("Can't parse line $i: $s");
        }
    }
    $i++;
}
fclose($fh) or die("Can't close $log_file");

// sort the array (in reverse) by number of requests
arsort($requests);

// print formatted results
foreach ($requests as $request => $accesses) {
    printf("%6d   %s\n",$accesses,$request);
}

TAZIO MIRANDOLA Web Applications Developer

references

basics Regex reference

Examples

greedy / non-greedy

miscellanea

Tazio Mirandola Web Applications Developer