| 
									
										
										
										
											2001-04-09 19:54:03 +00:00
										 |  |  | /************************************************************************* | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | *	 YAP Prolog 							 * | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | *	Yap Prolog was developed at NCCUP - Universidade do Porto	 * | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | * Copyright L.Damas, V.S.Costa and Universidade do Porto 1985-1997	 * | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | ************************************************************************** | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | * File:		regexp.yap						 * | 
					
						
							|  |  |  | * Last rev:	3/22/2000						 * | 
					
						
							|  |  |  | * mods:									 * | 
					
						
							|  |  |  | * comments:	Support for Regular Expressions	in YAP			 * | 
					
						
							|  |  |  | *									 * | 
					
						
							|  |  |  | *************************************************************************/ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-11-18 15:06:25 +00:00
										 |  |  | /** | 
					
						
							|  |  |  |  * @file   regexp.yap | 
					
						
							|  |  |  |  * @author VITOR SANTOS COSTA <vsc@VITORs-MBP.lan> | 
					
						
							|  |  |  |  * @date   Wed Nov 18 00:27:52 2015 | 
					
						
							|  |  |  |  *  | 
					
						
							|  |  |  |  * @brief  Support for Regular Expressions	in YAP | 
					
						
							|  |  |  |  *  | 
					
						
							|  |  |  |  *  | 
					
						
							|  |  |  | */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :- module(regexp, [ | 
					
						
							|  |  |  | 	regexp/3, | 
					
						
							|  |  |  | 	regexp/4 | 
					
						
							|  |  |  |           ]). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /** @defgroup regexp Regular Expressions | 
					
						
							| 
									
										
										
										
											2015-01-04 23:58:23 +00:00
										 |  |  | @ingroup library | 
					
						
							| 
									
										
										
										
											2014-09-11 14:06:57 -05:00
										 |  |  | @{ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This library includes routines to determine whether a regular expression | 
					
						
							|  |  |  | matches part or all of a string. The routines can also return which | 
					
						
							|  |  |  | parts of the string matched the expression or subexpressions of | 
					
						
							|  |  |  | it. This library relies on Henry Spencer's `C`-package and is only | 
					
						
							|  |  |  | available in operating systems that support dynamic loading. The | 
					
						
							|  |  |  | `C`-code has been obtained from the sources of FreeBSD-4.0 and is | 
					
						
							|  |  |  | protected by copyright from Henry Spencer and from the Regents of the | 
					
						
							|  |  |  | University of California (see the file library/regex/COPYRIGHT for | 
					
						
							|  |  |  | further details). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Much of the description of regular expressions below is copied verbatim | 
					
						
							|  |  |  | from Henry Spencer's manual page. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A regular expression is zero or more branches, separated by `|`.  It | 
					
						
							|  |  |  | matches anything that matches one of the branches. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A branch is zero or more pieces, concatenated.  It matches a match for | 
					
						
							|  |  |  | the first, followed by a match for the second, etc. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A piece is an atom possibly followed by `\*`, `+`, or `?`.  An atom | 
					
						
							|  |  |  | followed by `\*` matches a sequence of 0 or more matches of the atom. | 
					
						
							|  |  |  | An atom followed by `+` matches a sequence of 1 or more matches of the | 
					
						
							|  |  |  | atom.  An atom followed by `?` matches a match of the atom, or the | 
					
						
							|  |  |  | null string. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | An atom is a regular expression in parentheses (matching a match for the | 
					
						
							|  |  |  | regular expression), a range (see below), `.`  (matching any single | 
					
						
							|  |  |  | character), `^` (matching the null string at the beginning of the | 
					
						
							|  |  |  | input string), `$` (matching the null string at the end of the input | 
					
						
							|  |  |  | string), a `\` followed by a single character (matching that | 
					
						
							|  |  |  | character), or a single character with no other significance (matching | 
					
						
							|  |  |  | that character). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | A range is a sequence of characters enclosed in `[]`.  It normally | 
					
						
							|  |  |  | matches any single character from the sequence.  If the sequence begins | 
					
						
							|  |  |  | with `^`, it matches any single character not from the rest of the | 
					
						
							|  |  |  | sequence.  If two characters in the sequence are separated by `-`, | 
					
						
							|  |  |  | this is shorthand for the full list of ASCII characters between them | 
					
						
							|  |  |  | (e.g. `[0-9]` matches any decimal digit).  To include a literal `]` | 
					
						
							|  |  |  | in the sequence, make it the first character (following a possible | 
					
						
							|  |  |  | `^`).  To include a literal `-`, make it the first or last | 
					
						
							|  |  |  | character. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  @pred regexp(+ _RegExp_,+ _String_,+ _Opts_)  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Match regular expression  _RegExp_ to input string  _String_ | 
					
						
							|  |  |  | according to options  _Opts_. The options may be: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + `nocase`: Causes upper-case characters  in   _String_ to | 
					
						
							|  |  |  | be treated  as  lower case during the matching process. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   | 
					
						
							|  |  |  | */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /** @pred regexp(+ _RegExp_,+ _String_,+ _Opts_,? _SubMatchVars_) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Match regular expression  _RegExp_ to input string  _String_ | 
					
						
							|  |  |  | according to options  _Opts_. The variable  _SubMatchVars_ should | 
					
						
							|  |  |  | be originally unbound or a list of unbound variables; it will contain a | 
					
						
							|  |  |  | sequence of matches, that is, the head of  _SubMatchVars_ will | 
					
						
							|  |  |  | contain the characters in  _String_ that matched the leftmost | 
					
						
							|  |  |  | parenthesized subexpression within  _RegExp_, the next head of list | 
					
						
							|  |  |  | will contain the characters that matched the next parenthesized | 
					
						
							|  |  |  | subexpression to the right in  _RegExp_, and so on. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | The options may be: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + `nocase`: Causes upper-case characters  in   _String_ to | 
					
						
							|  |  |  | be treated  as  lower case during the matching process. | 
					
						
							|  |  |  | + `indices`: Changes what  is  stored  in | 
					
						
							|  |  |  |  _SubMatchVars_. Instead  of storing the matching characters from | 
					
						
							|  |  |  |  _String_, each variable will contain a term of the form  _IO-IF_ | 
					
						
							|  |  |  | giving the indices in  _String_ of the first and last characters  in | 
					
						
							|  |  |  | the  matching range of characters. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | In general there may be more than one way to match a regular expression | 
					
						
							|  |  |  | to an input string.  For example,  consider the command | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ~~~~~ | 
					
						
							|  |  |  |   regexp("(a*)b*","aabaaabb", [], [X,Y]) | 
					
						
							|  |  |  | ~~~~~ | 
					
						
							|  |  |  | Considering only the rules given so far,  _X_ and  _Y_ could end up | 
					
						
							|  |  |  | with the values `"aabb"` and `"aa"`, `"aaab"` and | 
					
						
							|  |  |  | `"aaa"`, `"ab"` and `"a"`, or any of several other | 
					
						
							|  |  |  | combinations.  To resolve this potential ambiguity `regexp` chooses among | 
					
						
							|  |  |  | alternatives using the rule `first then longest`.  In other words, it | 
					
						
							|  |  |  | considers the possible matches in order working from left to right | 
					
						
							|  |  |  | across the input string and the pattern, and it attempts to match longer | 
					
						
							|  |  |  | pieces of the input string before shorter ones.  More specifically, the | 
					
						
							|  |  |  | following rules apply in decreasing order of priority: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + If a regular expression could match  two  different parts of an | 
					
						
							|  |  |  | input string then it will match the one that begins earliest. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + If a regular expression contains "|"  operators  then the leftmost matching sub-expression is chosen. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + In \*, +, and ? constructs, longer matches are chosen in preference to shorter ones. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | + In sequences of expression  components  the  components are considered from left to right. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | In the example above, `"(a\*)b\*"` matches `"aab"`: the | 
					
						
							|  |  |  | `"(a\*)"` portion of the pattern is matched first and it consumes | 
					
						
							|  |  |  | the leading `"aa"`; then the `"b\*"` portion of the pattern | 
					
						
							|  |  |  | consumes the next `"b"`.  Or, consider the following example:  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ~~~~~ | 
					
						
							|  |  |  |   regexp("(ab|a)(b*)c",  "abc", [], [X,Y,Z]) | 
					
						
							|  |  |  | ~~~~~ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | After this command  _X_ will be `"abc"`,  _Y_ will be | 
					
						
							|  |  |  | `"ab"`, and  _Z_ will be an empty string.  Rule 4 specifies that | 
					
						
							|  |  |  | `"(ab|a)"` gets first shot at the input string and Rule 2 specifies | 
					
						
							|  |  |  | that the `"ab"` sub-expression is checked before the `"a"` | 
					
						
							|  |  |  | sub-expression.  Thus the `"b"` has already been claimed before the | 
					
						
							|  |  |  | `"(b\*)"` component is checked and `(b\*)` must match an empty string. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2004-09-30 21:37:41 +00:00
										 |  |  | :- load_foreign_files([regexp], [], init_regexp). | 
					
						
							| 
									
										
										
										
											2001-04-09 19:54:03 +00:00
										 |  |  | 
 | 
					
						
							% regexp(+RegExp, +String, +Opts)
							%
							% Succeeds if RegExp matches String under the options in Opts.
							% Both RegExp and String are measured with length/2 and their
							% lengths are handed to check_regexp/5 together with the integer
							% option mask computed by check_opts/4.
							% NOTE(review): check_regexp/5 is not defined in this view --
							% presumably it comes from the foreign library loaded above; confirm.
							regexp(RegExp, String, Opts) :-
								% The call itself is the context term attached to thrown errors.
								Goal = regexp(RegExp, String, Opts),
								length(RegExp, RegExpLen),
								length(String, StringLen),
								check_opts(Opts, 0, Flags, Goal),
								check_regexp(RegExp, RegExpLen, String, StringLen, Flags).
					
						
							|  |  |  | 
 | 
					
						
							% regexp(+RegExp, +String, +Opts, ?OUT)
							%
							% Like regexp/3, but also reports sub-matches through OUT.
							% OUT is validated (and its variables counted) by check_out/4,
							% Opts is folded into an integer mask by check_opts/4, and
							% everything is passed on to check_regexp/7.
							% NOTE(review): check_regexp/7 is not defined in this view --
							% presumably it comes from the foreign library loaded above; confirm.
							regexp(RegExp, String, Opts, OUT) :-
								% The call itself is the context term attached to thrown errors.
								Goal = regexp(RegExp, String, Opts, OUT),
								length(RegExp, RegExpLen),
								length(String, StringLen),
								check_out(OUT, 0, Count, Goal),
								check_opts(Opts, 0, Flags, Goal),
								check_regexp(RegExp, RegExpLen, String, StringLen, Flags, OUT, Count).
					
						
							|  |  |  | 
 | 
					
						
							%
							% check_out(?Out, +Seen, -Total, +Goal)
							%
							% Out must be either unbound or a list of unbound variables.
							%  - Out unbound: succeeds immediately; Total is left untouched.
							%  - Out a proper list: Total is Seen plus the number of elements.
							%  - Any bound element, or any other bound term in list position,
							%    raises error(uninstantiation_error(Culprit), Goal).
							%
							check_out(Out, Seen, Total, Goal) :-
								(   var(Out) ->
								    true
								;   Out == [], Total = Seen ->
								    true
								;   Out = [Slot|Rest] ->
								    % every slot must still be free so that it can receive a match
								    (   var(Slot) ->
								        true
								    ;   throw(error(uninstantiation_error(Slot), Goal))
								    ),
								    Next is Seen + 1,
								    check_out(Rest, Next, Total, Goal)
								;   throw(error(uninstantiation_error(Out), Goal))
								).
					
						
							| 
									
										
										
										
											2001-04-09 19:54:03 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | % | 
					
						
							|  |  |  | % Option processing | 
					
						
							|  |  |  | % | 
					
						
							% check_opts(+Opts, +Flags0, -Flags, +Goal)
							%
							% Fold the option list Opts into an integer mask, starting from
							% Flags0. Each option is translated to its numeric value by
							% process_opt/3 and merged into the running mask.
							%
							% Errors (all with Goal as context):
							%  - instantiation_error   if Opts, or its tail, is unbound;
							%  - type_error(list,Opts) if Opts is bound to something that is
							%    not a list;
							%  - whatever process_opt/3 raises for a bad option.
							check_opts(Opts, _, _, Goal) :-
								var(Opts), !,
								throw(error(instantiation_error, Goal)).
							check_opts([], Flags, Flags, _) :- !.
							check_opts([Opt|Rest], Flags0, Flags, Goal) :- !,
								process_opt(Opt, Bit, Goal),
								% Merge with bitwise OR rather than addition so that a repeated
								% option (e.g. [nocase,nocase]) cannot turn into a different flag.
								Flags1 is Flags0 \/ Bit,
								check_opts(Rest, Flags1, Flags, Goal).
							check_opts(Opts, _, _, Goal) :-
								% Opts is bound but not a list: the expected type is `list`.
								throw(error(type_error(list, Opts), Goal)).
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-06-20 13:48:45 +00:00
										 % process_opt(+Opt, -Bit, +Goal)
										 %
										 % Translate one option atom into its numeric flag value:
										 %   nocase  -> 1
										 %   indices -> 2
										 % An unbound Opt raises instantiation_error; anything else raises
										 % domain_error(flag_value, regexp_options+Opt). Goal is the error
										 % context in both cases.
										 process_opt(Opt, Bit, Goal) :-
										 	(   var(Opt) ->
										 	    throw(error(instantiation_error, Goal))
										 	;   Opt == nocase, Bit = 1 ->
										 	    true
										 	;   Opt == indices, Bit = 2 ->
										 	    true
										 	;   throw(error(domain_error(flag_value, regexp_options+Opt), Goal))
										 	).
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 |