|
<?php |
|
//To parse the following unevenly categorized columns: |
|
/* |
|
* col1 col2 col3 |
|
* ==== ==== ==== |
|
* 1 a b c d e 103 14 as d9 |
|
* 2 a 103 14 as d9 |
|
* 3 a 103 14 as d9 |
|
* |
|
* Into structured data like this: |
|
* |
|
* col1 col2 col3 |
|
* ==== ==== ==== |
|
* 1 a b c d e 103 14 as d9 |
|
* 2 a 103 14 as d9 |
|
* 3 a 103 14 as d9 |
|
*/ |
|
|
|
$headings = array('col1','col2','col3'); |
|
$header = "col1 col2 col3"; |
|
//get the $heading_pos_list by parsing the headings of each column with |
|
list($heading_pos_list, $lengths) = parse_heading($headings, $header); |
|
//Parse each line into a row structure |
|
$line = '1 a b c d e 103 14 as d9'; |
|
$row = parse_line_approx($line, $heading_pos_list); |
|
//this works for each column and line |
|
echo $row('col1'); |
|
//output: |
|
//1 a b c d e |
|
|
|
|
|
/************* Functions Below **************/ |
|
/** |
|
* parse_heading |
|
* |
|
* Get the position and length of each column name in $heading |
|
* if a column name is not in $line then remove it from $heading |
|
* |
|
* @param array $heading |
|
* List of column names to get info about from $line |
|
* Ex: array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status"); |
|
* |
|
* @param string $line |
|
* The header for the columns |
|
* Ex: "Logical Drive Name LUN Controller Accessible by Logical Drive status" |
|
* |
|
* @param boolean $is_logging |
|
* Turn off log messages when a header is not found |
|
* |
|
* @return array $return_val |
|
* Multidimentional array containing the start position of each column and length |
|
* $return_val[0] = multi-element associative array |
|
* Ex: $return_val[0]['Controller'] |
|
* $return_val[0]['Accessible by'] |
|
* $return_val[0] contains the starting position for each header |
|
* $heading_key is the column name that has been like 'CONTROLLER' above |
|
* Ex: $return_val[0][$heading_key] = starting position of heading |
|
* $return_val[1] contains the length of each header |
|
* Ex: $return_val[1][$heading_key] = length of heading including white space after it |
|
* |
|
*/ |
|
function parse_heading(&$heading, $line, $is_logging = TRUE) { |
|
|
|
//search $line to find each start position of $heading |
|
$start_pos=array(); |
|
$lengthOf=array(); |
|
$num_headings = count($heading); |
|
|
|
for ($i = 0; $i < $num_headings; $i++){ |
|
$column_name = $heading[$i]; |
|
$matches = array(); |
|
$start = strpos($line, $column_name); |
|
if (preg_match("/^.*?$column_name\s*/", $line, $matches)){ |
|
$end = strlen($matches[0]); |
|
if($start !== FALSE) { |
|
$start_pos[$column_name] = $start; |
|
$lengthOf[$column_name] = $end-$start; |
|
} |
|
} else { |
|
$heading[$i] = NULL; |
|
if ($is_logging){ |
|
echo "Alert: Heading '$column_name' was not found.\n"; |
|
} |
|
} |
|
} |
|
|
|
//filter the elements of the array using the function strlen, |
|
//if strlen returns 0 then the element is removed |
|
$heading = array_filter($heading, 'strlen'); |
|
|
|
//return start positions and lengths |
|
return(array($start_pos, $lengthOf)); |
|
} |
|
|
|
|
|
/** |
|
* parse_line_approx |
|
* |
|
* For parsing columns with raged edges. Pull the data out of $line and |
|
* put it into an associative array of fields and keys found in $heading |
|
* Ex: |
|
* Array(0=>Array('key'=>0),1=>Array('key'=>3)). |
|
* |
|
* If the positions are close, but off by a few characters, then adjust |
|
* the start and end to the real edge of the column. This function is an |
|
* improvement on parse_line because it deals with ragged edges of columns. |
|
* |
|
* Example Usage: |
|
* Say we have the following data: |
|
* col1 col2 col3 |
|
* ==== ==== ==== |
|
* 1 a b c d e 103 14 as d9 |
|
* 2 a 103 14 as d9 |
|
* 3 a 103 14 as d9 |
|
* |
|
* If we want to transform the data into this |
|
* col1 col2 col3 |
|
* ==== ==== ==== |
|
* 1 a b c d e 103 14 as d9 |
|
* 2 a 103 14 as d9 |
|
* 3 a 103 14 as d9 |
|
* |
|
* We would do the following: |
|
* 1.) get the $heading_pos_list by parsing the headings of each column with |
|
* list($heading_pos_list,$lengths) = parse_heading(array('col1','col2','col3'),"col1 col2 col3") |
|
* 2.) Parse each line into a row structure |
|
* $row = parse_line_approx($line, $heading_pos_list) |
|
* 3.) echo "$row('col1')" etc. etc. for each column and line |
|
* 4.) format it into columns |
|
* |
|
* @param string $line |
|
* The input to be processed |
|
* |
|
* @param array $heading_pos_list |
|
* The return value you get from list($heading_pos_list,$lengths)=parse_heading(...) |
|
* used to determine where columns start ($return_value['some_key']) in a line |
|
* |
|
* @return array |
|
* An array of keys and coresponding values like |
|
* array('some_key1'=>'column_value1','some_key2'=>'column_value2','etc'=>'etc') |
|
*/ |
|
function parse_line_approx($line, $heading_pos_list) { |
|
|
|
//init vars |
|
//return variable |
|
$row = array(); |
|
//list of whitespaces and values in $line |
|
$chunks_array = array(); |
|
//list of positions of chunks in string $line |
|
$chunks_pos_array = array(); |
|
//list of distances of each chunk from every heading |
|
$chunks_dist_array = array(); |
|
//chunks as they are matched via preg_match_all |
|
$matches = array(); |
|
|
|
//get chunks |
|
preg_match_all("/\s+|[^\s]+/",$line,$matches,PREG_OFFSET_CAPTURE); |
|
while ( $el = array_shift($matches[0]) ){ |
|
$chunks_array[] = $el[0]; |
|
$chunks_pos_array[] = $el[1]; |
|
} |
|
|
|
//calculate distances between chunks and each column heading |
|
$counter = 0; |
|
$chunks_dist_array = array(); |
|
foreach ($heading_pos_list as $heading=>$start_val){ |
|
|
|
//begining of column |
|
$start = $start_val; |
|
|
|
foreach ($chunks_pos_array as $index=>$chunk_pos){ |
|
$distance = abs($start-$chunk_pos); |
|
$chunks_dist_array[$counter][] = array( "heading"=>$heading, |
|
"distance"=>$distance, |
|
"chunk_index"=>$index, |
|
"heading_pos"=>$start, |
|
"chunk_pos"=>$chunk_pos, |
|
"chunk_val"=>$chunks_array[$index] ); |
|
} |
|
$counter++; |
|
} |
|
|
|
//categorize into each heading by whichever is closest to the heading |
|
$processed_indexes = array(); |
|
foreach ($chunks_dist_array as $index=>$info){ |
|
if (!isset($chunks_dist_array[$index+1])){ |
|
//add remaining to the last column |
|
for ($i=0;$i<$len;$i++){ |
|
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){ |
|
if (!isset($row[$info[$i]['heading']])){ |
|
$row[$info[$i]['heading']] = ""; |
|
} |
|
$row[$info[$i]['heading']] .= $info[$i]['chunk_val']; |
|
$processed_indexes[] = $info[$i]['chunk_index']; |
|
} |
|
} |
|
break; |
|
} |
|
$next_info = $chunks_dist_array[$index+1]; |
|
$len = count($info); |
|
$next_heading_pos = $next_info[0]['heading_pos']; |
|
$last_proc_index = 0; |
|
for ($i=0;$i<$len;$i++){ |
|
//categorize into each heading by whichever is closest to the heading |
|
/* |
|
col1 col2 col3 col1 col2 col3 |
|
================================ ================================ |
|
a b c d 103 14 as d9 => a b c 103 14 as d9 |
|
*/ |
|
if ($info[$i]['distance'] < $next_info[$i]['distance']){ |
|
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){ |
|
if (!isset($row[$info[$i]['heading']])){ |
|
$row[$info[$i]['heading']] = ""; |
|
} |
|
$row[$info[$i]['heading']] .= $info[$i]['chunk_val']; |
|
$processed_indexes[] = $info[$i]['chunk_index']; |
|
$last_proc_index = $i; |
|
} |
|
} else if ($next_info[$i]['chunk_pos'] < $next_heading_pos ) { |
|
//columns are not always seperated by which chunk is closer to a column |
|
//take into account cases (like col1) with wide columns full of spaces: |
|
/* |
|
col1 col2 col3 col1 col2 col3 |
|
================================ ================================ |
|
a b c d e 103 14 as d9 => a b c d e 103 14 as d9 |
|
*/ |
|
|
|
//where the start of a col2 is demarcated by a larger space: |
|
if ( preg_match("/^\s{2,}|\t$/",$info[$i]['chunk_val']) ){ |
|
//if there is a big space then break into previous heading |
|
for ($j=$last_proc_index+1;$j<=$i;$j++){ |
|
$row[$info[$j]['heading']] .= $info[$j]['chunk_val']; |
|
$processed_indexes[] = $info[$j]['chunk_index']; |
|
} |
|
//found a logical break, process next column |
|
break 1; |
|
} |
|
|
|
} else { |
|
//too far away to categorize in this heading. Move to next heading |
|
break 1; |
|
} |
|
} |
|
} |
|
|
|
//clean leading and trailing whitespace |
|
foreach ($row as $heading=>$col_val){ |
|
$row[$heading] = trim($col_val); |
|
} |
|
|
|
return $row; |
|
|
|
} |