jackieku wrote:
@fincs
Could you review whether this
patch is valid? Is it the same as your fix?
Actually I have done it the other way around
Code:
// L14: Moved to separate function from RegExMatch, for use with callouts.
//
// Stores the result of every capturing subpattern (pattern numbers 1..pattern_count-1) into
// variables whose names are derived from output_var's name. Pattern #0 (the entire match) is
// not written here; presumably the caller handles it.
//
// Parameters:
//   haystack                     - TCHAR string from which matched substrings are copied.
//   re, extra                    - Compiled pattern and its study data; queried via pcre_fullinfo()
//                                  for the named-subpattern table and the option flags.
//   get_positions_not_substrings - true: store <base>Pos<id> and <base>Len<id> for each subpattern;
//                                  false: store the matched substring in <base><id>.
//                                  <id> is the subpattern's name if it has one, else its number.
//   output_var                   - Supplies the base variable name and (via IsLocal()) the scope
//                                  in which the element variables are looked up or created.
//   offset                       - Offset pairs: offset[2*p] is the start and offset[2*p+1] the end
//                                  of subpattern p. A negative start means "did not participate".
//                                  NOTE(review): presumably these are offsets into utf8Haystack in
//                                  Unicode builds, since they are passed through UTF8PosToTPos() --
//                                  confirm against the caller.
//   pattern_count                - Number of patterns including pattern #0.
//   captured_pattern_count       - Subpatterns numbered >= this value are treated as unmatched
//                                  (this also covers negative values such as PCRE_ERROR_NOMATCH).
//   mem_to_free                  - Receives a _tcsdup() copy of haystack if one had to be made
//                                  because an output variable's contents ARE the haystack.
//                                  NOTE(review): the caller is presumably responsible for freeing it.
//   utf8Haystack (UNICODE only)  - Passed to UTF8PosToTPos()/UTF8LenToTLen() to convert offsets.
void RegExSetSubpatternVars(LPTSTR haystack, pcre *re, pcre_extra *extra, bool get_positions_not_substrings, Var &output_var, int *offset, int pattern_count, int captured_pattern_count, LPTSTR &mem_to_free
#ifdef UNICODE
	, const char *utf8Haystack
#endif
	)
{
	// STORE THE SUBSTRINGS THAT MATCHED THE SUBPATTERNS (EVEN IF PCRE_ERROR_NOMATCH).
	// For lookup performance, create a table of subpattern names indexed by subpattern number.
	LPCSTR *subpat_name = NULL; // Set default as "no subpattern names present or available".
	bool allow_dupe_subpat_names = false; // Set default.
	LPCSTR name_table;
	int name_count, name_entry_size;
	if (   !pcre_fullinfo(re, extra, PCRE_INFO_NAMECOUNT, &name_count) // Success. Fix for v1.0.45.01: Don't check captured_pattern_count>=0 because PCRE_ERROR_NOMATCH can still have named patterns!
		&& name_count // There's at least one named subpattern. Relies on short-circuit boolean order.
		&& !pcre_fullinfo(re, extra, PCRE_INFO_NAMETABLE, &name_table) // Success.
		&& !pcre_fullinfo(re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_entry_size)   ) // Success.
	{
		int pcre_options;
		if (!pcre_fullinfo(re, extra, PCRE_INFO_OPTIONS, &pcre_options)) // Success.
			allow_dupe_subpat_names = (pcre_options & PCRE_DUPNAMES) != 0;
		// For indexing simplicity, also include an entry for the main/entire pattern at index 0 even though
		// it's never used because the entire pattern can't have a name without enclosing it in parentheses
		// (in which case it's not the entire pattern anymore, but in fact subpattern #1).
		size_t subpat_array_size = pattern_count * sizeof(LPCSTR);
		subpat_name = (LPCSTR *)_alloca(subpat_array_size); // Stack allocation: released automatically when this function returns.
		ZeroMemory(subpat_name, subpat_array_size); // Set default for each index to be "no name corresponds to this subpattern number".
		for (int i = 0; i < name_count; ++i, name_table += name_entry_size)
		{
			// The first two bytes of each name-table entry hold the pattern number, most significant
			// byte first. They must be read as UNSIGNED bytes: name_table is a plain char pointer, and
			// where char is signed a byte >= 0x80 would sign-extend and produce a negative/wrong index
			// (i.e. an out-of-bounds write) for any named subpattern numbered 128 or higher.
			int subpat_number = (static_cast<unsigned char>(name_table[0]) << 8)
				| static_cast<unsigned char>(name_table[1]);
			// Defensive bounds check so that an unexpected table entry can never write outside the array.
			if (subpat_number < pattern_count)
				subpat_name[subpat_number] = name_table + 2;
			// For simplicity and unlike PHP, IsPureNumeric() isn't called to forbid numeric subpattern names.
			// It seems the worst that could happen if it is numeric is that it would overlap/overwrite some of
			// the numerically-indexed elements in the output-array. Seems pretty harmless given the rarity.
		}
	}
	//else one of the pcre_fullinfo() calls may have failed. The PCRE docs indicate that this realistically never
	// happens unless bad inputs were given. So due to rarity, just leave subpat_name==NULL; i.e. "no named subpatterns".

	// Make var_name longer than Max so that FindOrAddVar() will be able to spot and report var names
	// that are too long, either because the base-name is too long, or the name becomes too long
	// as a result of appending the array index number:
	TCHAR var_name[MAX_VAR_NAME_LENGTH + 68]; // Allow +3 extra for "Len" and "Pos" suffixes, +1 for terminator, and +64 for largest sub-pattern name (actually it's 32, but 64 allows room for future expansion). 64 is also enough room for the largest 64-bit integer, 20 chars: 18446744073709551616
	_tcscpy(var_name, output_var.mName); // This prefix is copied in only once, for performance.
	size_t suffix_length, prefix_length = _tcslen(var_name);
	LPTSTR var_name_suffix = var_name + prefix_length; // The position at which to copy the sequence number (index).
	int always_use = output_var.IsLocal() ? ALWAYS_USE_LOCAL : ALWAYS_USE_GLOBAL;
	int n, p = 1, *this_offset = offset + 2; // Init for both loops below: start at subpattern #1 and its offset pair.
	Var *array_item;
	bool subpat_not_matched;

	if (get_positions_not_substrings)
	{
		int subpat_pos, subpat_len;
		for (; p < pattern_count; ++p, this_offset += 2) // Starts at 1 because pattern #0 (the full pattern) isn't handled by this function.
		{
			subpat_not_matched = (p >= captured_pattern_count || this_offset[0] < 0); // See comments in similar section below about this.
			if (subpat_not_matched)
			{
				subpat_pos = 0;
				subpat_len = 0;
			}
			else // NOTE: The formulas below work even for a capturing subpattern that wasn't actually matched, such as one of the following: (abc)|(123)
			{
				subpat_pos = this_offset[0];
				subpat_len = this_offset[1] - this_offset[0]; // It seemed more convenient for scripts to store Length instead of an ending offset.
			}

			if (subpat_name && subpat_name[p]) // This subpattern number has a name, so store it under that name.
			{
				if (*subpat_name[p]) // This check supports allow_dupe_subpat_names. See comments below.
				{
#ifdef UNICODE
					CStringTCharFromUTF8 subpat_name_wide(subpat_name[p]);
#define TMP_SUBPAT_NAME subpat_name_wide
#else
#define TMP_SUBPAT_NAME subpat_name[p]
#endif
					suffix_length = _stprintf(var_name_suffix, _T("Pos%s"), TMP_SUBPAT_NAME); // Append the subpattern to the array's base name.
					if ((array_item = g_script.FindOrAddVar(var_name, prefix_length + suffix_length, always_use)) != NULL)
						// One-based; position zero means "not found", so the +1 must NOT be applied to an unmatched subpattern.
						array_item->Assign(subpat_not_matched ? 0 : UTF8PosToTPos(utf8Haystack, subpat_pos) + 1);
					suffix_length = _stprintf(var_name_suffix, _T("Len%s"), TMP_SUBPAT_NAME); // Append the subpattern name to the array's base name.
					if ((array_item = g_script.FindOrAddVar(var_name, prefix_length + suffix_length, always_use)) != NULL)
						array_item->Assign(UTF8LenToTLen(utf8Haystack, subpat_pos, subpat_len));
					// Fix for v1.0.45.01: Section below added. See similar section further below for comments.
					if (!subpat_not_matched && allow_dupe_subpat_names) // Explicitly check subpat_not_matched not pos/len so that behavior is consistent with the default mode (non-position).
						for (n = p + 1; n < pattern_count; ++n) // Search to the right of this subpat to find others with the same name.
							if (subpat_name[n] && !stricmp(subpat_name[n], subpat_name[p])) // Case-insensitive because unlike PCRE, named subpatterns conform to AHK convention of insensitive variable names.
								subpat_name[n] = ""; // Empty string signals subsequent iterations to skip it entirely.
				}
				//else an empty subpat name caused by "allow duplicate names". Do nothing (see comments above).
			}
			else // This subpattern has no name, so write it out as its pattern number instead. For performance and memory utilization, it seems best to store only one or the other (named or number), not both.
			{
				// For comments about this section, see the similar for-loop later below.
				suffix_length = _stprintf(var_name_suffix, _T("Pos%d"), p); // Append the element number to the array's base name.
				if ((array_item = g_script.FindOrAddVar(var_name, prefix_length + suffix_length, always_use)) != NULL)
					// One-based; position zero means "not found", so the +1 must NOT be applied to an unmatched subpattern.
					array_item->Assign(subpat_not_matched ? 0 : UTF8PosToTPos(utf8Haystack, subpat_pos) + 1);
				//else var couldn't be created: no error reporting currently, since it basically should never happen.
				suffix_length = _stprintf(var_name_suffix, _T("Len%d"), p); // Append the element number to the array's base name.
				if ((array_item = g_script.FindOrAddVar(var_name, prefix_length + suffix_length, always_use)) != NULL)
					array_item->Assign(UTF8LenToTLen(utf8Haystack, subpat_pos, subpat_len));
			}
		}
		return;
	} // if (get_positions_not_substrings)

	// Otherwise, we're in get-substring mode (not offset mode), so store the substring that matches each subpattern.
	for (; p < pattern_count; ++p, this_offset += 2) // Starts at 1 because pattern #0 (the full pattern) isn't handled by this function.
	{
		// If both items in this_offset are -1, that means the substring wasn't populated because its
		// subpattern wasn't needed to find a match (or there was no match for *anything*). For example:
		// "(xyz)|(abc)" (in which only one subpattern will match).
		// NOTE: PCRE isn't clear on this, but it seems likely that captured_pattern_count
		// (returned from pcre_exec()) can be less than pattern_count (from pcre_fullinfo/
		// PCRE_INFO_CAPTURECOUNT). So the below takes this into account by not trusting values
		// in offset[] that are beyond captured_pattern_count. Further evidence of this is PCRE's
		// pcre_copy_substring() function, which consults captured_pattern_count to decide whether to
		// consult the offset array. The formula below works even if captured_pattern_count==PCRE_ERROR_NOMATCH.
		subpat_not_matched = (p >= captured_pattern_count || this_offset[0] < 0); // Relies on short-circuit boolean order.

		if (subpat_name && subpat_name[p]) // This subpattern number has a name, so store it under that name.
		{
			if (*subpat_name[p]) // This check supports allow_dupe_subpat_names. See comments below.
			{
				// This section is similar to the one in the "else" below, so see it for more comments.
#ifdef UNICODE
				_tcscpy(var_name_suffix, CStringTCharFromUTF8(subpat_name[p])); // Append the subpat name to the array's base name. _tcscpy() seems safe because PCRE almost certainly enforces the 32-char limit on subpattern names.
#else
				_tcscpy(var_name_suffix, subpat_name[p]); // Append the subpat name to the array's base name. _tcscpy() seems safe because PCRE almost certainly enforces the 32-char limit on subpattern names.
#endif
				if ((array_item = g_script.FindOrAddVar(var_name, 0, always_use)) != NULL)
				{
					if (subpat_not_matched)
						array_item->Assign(); // Omit all parameters to make the var empty without freeing its memory (for performance, in case this RegEx is being used many times in a loop).
					else
					{
						// If this variable's contents ARE the haystack and later subpatterns still need to
						// read from it, work from a private copy so this assignment can't corrupt them.
						if (p < pattern_count-1 // i.e. there's at least one more subpattern after this one (if there weren't, making a copy of haystack wouldn't be necessary because overlap can't harm this final assignment).
							&& haystack == array_item->Contents(FALSE))
							if ((mem_to_free = _tcsdup(haystack)) != NULL)
								haystack = mem_to_free;
						array_item->Assign(haystack + UTF8PosToTPos(utf8Haystack, this_offset[0])
							, UTF8LenToTLen(utf8Haystack, this_offset[0], this_offset[1] - this_offset[0]));
						// Fix for v1.0.45.01: When the J option (allow duplicate named subpatterns) is in effect,
						// PCRE returns entries for all the duplicates. But we don't want an unmatched duplicate
						// to overwrite a previously matched duplicate. To prevent this, when we're here (i.e.
						// this subpattern matched something), mark duplicate entries in the names array that lie
						// to the right of this item to indicate that they should be skipped by subsequent iterations.
						if (allow_dupe_subpat_names)
							for (n = p + 1; n < pattern_count; ++n) // Search to the right of this subpat to find others with the same name.
								if (subpat_name[n] && !stricmp(subpat_name[n], subpat_name[p])) // Case-insensitive because unlike PCRE, named subpatterns conform to AHK convention of insensitive variable names.
									subpat_name[n] = ""; // Empty string signals subsequent iterations to skip it entirely.
					}
				}
				//else var couldn't be created: no error reporting currently, since it basically should never happen.
			}
			//else an empty subpat name caused by "allow duplicate names". Do nothing (see comments above).
		}
		else // This subpattern has no name, so instead write it out as its actual pattern number. For performance and memory utilization, it seems best to store only one or the other (named or number), not both.
		{
			_itot(p, var_name_suffix, 10); // Append the element number to the array's base name.
			if ((array_item = g_script.FindOrAddVar(var_name, 0, always_use)) != NULL)
			{
				if (subpat_not_matched)
					array_item->Assign(); // Omit all parameters to make the var empty without freeing its memory (for performance, in case this RegEx is being used many times in a loop).
				else
				{
					// Same haystack-aliasing protection as in the named-subpattern branch above.
					if (p < pattern_count-1 // i.e. there's at least one more subpattern after this one (if there weren't, making a copy of haystack wouldn't be necessary because overlap can't harm this final assignment).
						&& haystack == array_item->Contents(FALSE))
						if ((mem_to_free = _tcsdup(haystack)) != NULL)
							haystack = mem_to_free;
					array_item->Assign(haystack + UTF8PosToTPos(utf8Haystack, this_offset[0])
						, UTF8LenToTLen(utf8Haystack, this_offset[0], this_offset[1] - this_offset[0]));
				}
			}
			//else var couldn't be created: no error reporting currently, since it basically should never happen.
		}
	} // for() each subpattern.
}
Anyway, my version is almost ready; I now just have to merge your latest changes with mine.
EDIT: Released.