Published:
January 30, 2008
How to parse an HTML document and replace its form tags with their corresponding values
<?php
/**
 * HTML Form Parser
 *
 * Turns a filled-in HTML form into a read-only document: the <form> wrapper
 * is removed and each supported control is replaced by plain text.
 *
 * Where the replacement text comes from, per control:
 *  - text input:        the posted value ($_POST), left untouched if not posted
 *  - hidden / password: the control's own value attribute
 *  - textarea:          the textarea's own content
 *  - checkbox / radio:  the control's value if it equals the posted value,
 *                       otherwise an empty string (see replaceChoice())
 *  - select:            the posted value; the selects named myDay, myMonth and
 *                       myYear are rendered as day number, month name and year
 *
 * Submit, button, reset and image inputs are only counted, never replaced.
 *
 * @package HtmlFormParser
 * @version $Id 1.0
 * @author Vinod Ram
 * @copyright 2008 vinodram
 */
class parseHTML
{
    /**
     * Number of submit/button/reset/image inputs seen so far. It is never
     * reset, so it accumulates across parseForms() calls on one instance.
     *
     * @var int
     */
    public $button_counter = 0;

    /**
     * Replace the form controls of an HTML document with their values.
     *
     * Controls are looked up in the first <html>...</html> section of
     * $html_data; replacements are applied to the whole of $html_data. If no
     * such section exists the input is returned unchanged. Values taken from
     * $_POST are HTML-escaped before being inserted.
     *
     * @param string $html_data content of the HTML file to parse
     * @return string the document with form controls replaced
     */
    public function parseForms($html_data)
    {
        if (!preg_match('/<html.*?>.+?<\/html>/is', $html_data, $document)) {
            return $html_data;
        }
        $form = $document[0];

        /* remove the form wrapper tags */
        $html_data = str_replace($this->findAll('/<form(?=[\s\/>])[^<>]*>/i', $form), '', $html_data);
        $html_data = str_replace($this->findAll('/<\/form\s*>/i', $form), '', $html_data);

        /* <input type=text>: replaced by the posted value when there is one */
        foreach ($this->findInputs('text', $form) as $tag) {
            $posted = $this->postedValue($this->getName($tag));
            if ($posted !== null) {
                $html_data = str_replace($tag, $this->escape($posted), $html_data);
            }
        }

        /* <input type=hidden|password>: replaced by their own value attribute */
        foreach (array('hidden', 'password') as $type) {
            foreach ($this->findInputs($type, $form) as $tag) {
                $html_data = str_replace($tag, (string) $this->getValue($tag), $html_data);
            }
        }

        /* <textarea>: replaced by its own content */
        foreach ($this->findAll('/<textarea.*?>.*?<\/textarea>/is', $form) as $textarea) {
            $html_data = str_replace($textarea, (string) $this->getTextAreaValue($textarea), $html_data);
        }

        /* <input type=checkbox>: the label must follow the tag directly */
        foreach ($this->findInputs('checkbox', $form) as $tag) {
            $html_data = $this->replaceChoice($tag, '', $html_data);
        }

        /* <input type=radio>: the label must follow the tag after whitespace */
        foreach ($this->findInputs('radio', $form) as $tag) {
            $html_data = $this->replaceChoice($tag, '\s+', $html_data);
        }

        /* buttons are left in the document and only counted */
        foreach (array('submit', 'button', 'reset', 'image') as $type) {
            $this->button_counter += count($this->findInputs($type, $form));
        }

        /* <select>: replaced as a whole, options included */
        foreach ($this->findAll('/<select(?=[\s\/>])[^<>]*>.*?<\/select>/is', $form) as $select) {
            $default = $this->defaultOptionValue($select);
            if ($default === null) {
                // A select without any <option> is left untouched.
                continue;
            }
            $replaceWith = $this->selectReplacement($this->getName($select), $default);
            $html_data = str_replace($select, $replaceWith, $html_data);
        }

        return $html_data;
    }

    /**
     * Get the name attribute from a tag string.
     *
     * @param string $string markup to search (the first name= attribute wins)
     * @return string|null trimmed name, or null when there is no name attribute
     */
    public function getName($string)
    {
        $name = $this->attributeValue('name', $string);

        return $name === null ? null : trim($name);
    }

    /**
     * Get the value attribute from a tag string.
     *
     * @param string $string markup to search (the first value= attribute wins)
     * @return string|null value without its surrounding quotes, or null when
     *                     there is no value attribute
     */
    public function getValue($string)
    {
        return $this->attributeValue('value', $string);
    }

    /**
     * Get the content between <textarea ...> and </textarea>.
     *
     * @param string $string textarea element markup
     * @return string|null the content, or null when $string holds no textarea
     */
    public function getTextAreaValue($string)
    {
        if (preg_match('/<textarea.*?>(.*?)<\/textarea>/is', $string, $match)) {
            return $match[1];
        }

        return null;
    }

    /**
     * Return the distinct full matches of $pattern in $subject.
     */
    private function findAll($pattern, $subject)
    {
        if (!preg_match_all($pattern, $subject, $matches)) {
            return array();
        }

        return array_values(array_unique($matches[0]));
    }

    /**
     * Return the distinct <input> tags of the given type found in $html.
     */
    private function findInputs($type, $html)
    {
        // The look-arounds keep "data-type=" and "type=textual" from matching.
        $pattern = '/<input(?=[\s\/>])[^<>]*(?<![\w-])type=["\']?'
            . preg_quote($type, '/')
            . '["\']?(?![\w-])[^<>]*>/i';

        return $this->findAll($pattern, $html);
    }

    /**
     * Read one attribute (double-quoted, single-quoted or bare) from markup.
     *
     * @return string|null the attribute value, or null when it is absent
     */
    private function attributeValue($attribute, $string)
    {
        $pattern = '/(?<![\w-])' . preg_quote($attribute, '/')
            . '=(?:"([^"]*)"|\'([^\']*)\'|([^\s>]*))/i';
        if (!preg_match($pattern, $string, $match)) {
            return null;
        }
        // Only one of the three alternatives captured something; the others
        // are either missing or empty strings.
        for ($group = count($match) - 1; $group >= 1; $group--) {
            if ($match[$group] !== '') {
                return $match[$group];
            }
        }

        return '';
    }

    /**
     * Return the posted value for a field as a string, or null when the field
     * has no name, was not posted, or was posted as a non-scalar (array).
     */
    private function postedValue($name)
    {
        if ($name === null || !array_key_exists($name, $_POST) || !is_scalar($_POST[$name])) {
            return null;
        }

        return (string) $_POST[$name];
    }

    /**
     * Escape user-supplied text for insertion into the HTML document.
     */
    private function escape($text)
    {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Replace a checkbox/radio tag together with the label text after it.
     *
     * The text searched for is the tag, then $separator, then the tag's value.
     * It is replaced by the value when the posted value equals it, and by an
     * empty string otherwise. A tag whose following text does not have that
     * shape is left in the document.
     *
     * @param string $tag       the input tag as it appears in the document
     * @param string $separator regex fragment expected between tag and label
     * @param string $html      document to modify
     * @return string the modified document
     */
    private function replaceChoice($tag, $separator, $html)
    {
        $value = (string) $this->getValue($tag);
        $posted = $this->postedValue($this->getName($tag));
        $replacement = ($posted !== null && $posted === $value) ? $value : '';

        $pattern = '/' . preg_quote($tag, '/') . $separator . preg_quote($value, '/') . '/';
        // Escape "\" and "$" so the replacement is never read as a back-reference.
        $literal = strtr($replacement, array('\\' => '\\\\', '$' => '\\$'));
        $result = preg_replace($pattern, $literal, $html);

        // preg_replace() returns null on a PCRE error; keep the document then.
        return $result === null ? $html : $result;
    }

    /**
     * Return the value of a select's default option.
     *
     * The default is the last option carrying "selected", or the first option
     * when none does. Its value attribute is used, falling back to the option
     * text when there is no value attribute.
     *
     * @return string|null trimmed value, or null when the select has no options
     */
    private function defaultOptionValue($select)
    {
        $pattern = '/<option(?=[\s\/>])([^<>]*)>(.*?)<\/option>/is';
        if (!preg_match_all($pattern, $select, $options, PREG_SET_ORDER)) {
            return null;
        }
        $chosen = $options[0];
        foreach ($options as $option) {
            if (preg_match('/(?<![\w-])selected(?![\w-])/i', $option[1])) {
                $chosen = $option;
            }
        }
        $value = $this->attributeValue('value', $chosen[1]);

        return trim($value !== null ? $value : $chosen[2]);
    }

    /**
     * Compute the text that replaces a select element.
     *
     * myDay, myMonth and myYear expect a numeric posted value and yield the
     * day number, English month name and year; -1 means "nothing chosen" and
     * yields an empty string. For myDay and myMonth the select's default
     * option value being -1 also yields an empty string. Any other select
     * yields the escaped posted value, or an empty string when not posted.
     *
     * @param string|null $name    name of the select
     * @param string      $default value of the select's default option
     * @return string replacement text
     */
    private function selectReplacement($name, $default)
    {
        $posted = $this->postedValue($name);
        $isDatePart = ($name === 'myDay' || $name === 'myMonth' || $name === 'myYear');

        if (!$isDatePart) {
            return $posted === null ? '' : $this->escape($posted);
        }
        if ($posted === null || !is_numeric($posted) || (int) $posted === -1) {
            return '';
        }
        if ($name !== 'myYear' && $default === '-1') {
            return '';
        }

        $number = (int) $posted;
        if ($name === 'myDay') {
            return date('j', mktime(0, 0, 0, 0, $number, 0));
        }
        if ($name === 'myMonth') {
            // Day 0 of month n+1 is the last day of month n.
            return date('F', mktime(0, 0, 0, $number + 1, 0, 0));
        }

        // Month 0, day 0 of year n+1 falls in November of year n.
        return date('Y', mktime(0, 0, 0, 0, 0, $number + 1));
    }
}
?>