Blog Image

How to parse an HTML document and replace the form tags with their corresponding values

<?php
/**
 * HTML Form Parser
 *
 * Takes the markup of a page containing a form and returns the same markup
 * with the form controls replaced by plain text, so that a submitted form can
 * be shown (or mailed) as a read-only document.
 *
 * $html_data is the content of the HTML file that you have to parse.
 * $_POST is the array containing the values the controls are replaced with.
 *
 * Known limitation: <form> and <input> tags are only recognised when their
 * attributes consist solely of the characters in ALLOWED_CHARS (letters,
 * digits, whitespace, "=", quotes and "_"). Tags containing anything else
 * (for example "-", "." or "/" inside an attribute) are left untouched.
 *
 * @package HtmlFormParser
 * @version $Id 1.0
 * @author Vinod Ram
 * @copyright 2008 vinodram
 */
class parseHTML
{
    /**
     * Characters accepted inside the attribute part of a <form>/<input> tag,
     * written as the body of a regex character class.
     */
    const ALLOWED_CHARS = 'a-zA-Z0-9\s="\'_';

    /**
     * Number of submit/button/reset/image inputs seen so far. It keeps
     * growing across successive parseForms() calls on the same instance.
     *
     * @var int
     */
    public $button_counter = 0;

    /**
     * Replace the form controls of an HTML document with their values.
     *
     * Only the first <html>...</html> element is scanned; when there is none
     * the input is returned unchanged. Replacements are literal string
     * replacements applied to the whole of $html_data:
     *
     *  - <form ...> and </form> tags are removed;
     *  - text inputs become the posted value (left alone when nothing was
     *    posted under their name);
     *  - hidden and password inputs become their own value attribute;
     *  - textareas become their own content;
     *  - a checkbox immediately followed by its value text becomes the posted
     *    value when that equals the value attribute, otherwise both the tag
     *    and the text are removed;
     *  - a radio button followed by whitespace and its value text is handled
     *    the same way as a checkbox;
     *  - selects containing options become the posted value; the selects named
     *    myDay, myMonth and myYear are rendered as day number, month name and
     *    four-digit year respectively, or as an empty string for "-1".
     *
     * Posted values are HTML-escaped before being written into the document.
     *
     * @param string $html_data markup of the page to process
     * @return string the processed markup
     */
    public function parseForms($html_data)
    {
        if (!preg_match('/<html.*>.+<\/html>/isU', $html_data, $document)) {
            return $html_data;
        }
        // Snapshot used for discovery only; every replacement is applied to $html_data.
        $form = $document[0];

        /* ---- remove form tags ---- */
        $formTag = '/<form([' . self::ALLOWED_CHARS . ']{0,})>|<\/form>/i';
        if (preg_match_all($formTag, $form, $formTags)) {
            $html_data = str_replace(array_unique($formTags[0]), '', $html_data);
        }

        /* ---- <input type=text> ---- */
        foreach ($this->findInputs($form, 'text') as $tag) {
            $posted = $this->postedValue($this->getName($tag));
            if ($posted !== null) {
                $html_data = str_replace($tag, $this->escape($posted), $html_data);
            }
        }

        /* ---- <input type=hidden> and <input type=password> ---- */
        foreach (array('hidden', 'password') as $type) {
            foreach ($this->findInputs($form, $type) as $tag) {
                // The tag is replaced literally: using it as a regex broke on "/>".
                $html_data = str_replace($tag, (string) $this->getValue($tag), $html_data);
            }
        }

        /* ---- <textarea> ---- */
        if (preg_match_all('/<textarea.*>.*<\/textarea>/isU', $form, $textareas)) {
            foreach ($textareas[0] as $textarea) {
                $html_data = str_replace(
                    $textarea,
                    (string) $this->getTextAreaValue($textarea),
                    $html_data
                );
            }
        }

        /* ---- <input type=checkbox> ---- */
        foreach ($this->findInputs($form, 'checkbox') as $tag) {
            $value = (string) $this->getValue($tag);
            // Tag and the value text that directly follows it are replaced together.
            $html_data = str_replace(
                $tag . $value,
                $this->choiceReplacement($tag, $value),
                $html_data
            );
        }

        /* ---- <input type=radio> ---- */
        foreach ($this->findInputs($form, 'radio') as $tag) {
            $value = (string) $this->getValue($tag);
            // Unlike checkboxes, whitespace is required between tag and value text.
            $pattern = '/' . preg_quote($tag, '/') . '\s+' . preg_quote($value, '/') . '/';
            $html_data = $this->replacePattern(
                $pattern,
                $this->choiceReplacement($tag, $value),
                $html_data
            );
        }

        /* ---- <input type=submit|button|reset|image> ---- */
        // Buttons are only counted; they stay in the document.
        foreach (array('submit', 'button', 'reset', 'image') as $type) {
            $this->button_counter += count($this->findInputs($form, $type));
        }

        /* ---- <select> ---- */
        if (preg_match_all('/<select.*>.+<\/select>/isU', $form, $selects)) {
            foreach ($selects[0] as $select) {
                // Selects without any <option>...</option> are left in place.
                if (!preg_match_all('/<option.*>.+<\/option>/isU', $select, $options)) {
                    continue;
                }
                $html_data = str_replace(
                    $select,
                    $this->selectReplacement($select, $options[0]),
                    $html_data
                );
            }
        }

        return $html_data;
    }

    /**
     * Get the name attribute from a tag.
     *
     * @param string $string markup of a single element
     * @return string|null the trimmed name, or null when no name attribute
     *                     made of word characters/whitespace is found
     */
    public function getName($string)
    {
        // "/" is accepted after the name so that name="x"/> is recognised too.
        if (preg_match('/name=["\']?([\w\s]*)["\']?[\s\/>]/i', $string, $match)) {
            return trim($match[1]);
        }

        return null;
    }

    /**
     * Get the value attribute from a tag.
     *
     * Handles double-quoted, single-quoted and unquoted values; the
     * surrounding quotes are not part of the result.
     *
     * @param string $string markup of a single element
     * @return string|null the attribute value, or null when there is none
     */
    public function getValue($string)
    {
        // (?| ... ) is a branch reset: whichever alternative matches, the value is group 1.
        if (preg_match('/value=(?|"([^"]*)"|\'([^\']*)\'|([^>\s]*))[^>]*>/is', $string, $match)) {
            return $match[1];
        }

        return null;
    }

    /**
     * Get the content between <textarea ...> and </textarea>.
     *
     * @param string $string markup of a textarea element
     * @return string|null the content, or null when $string holds no textarea
     */
    public function getTextAreaValue($string)
    {
        if (preg_match('/<textarea.*>(.*)<\/textarea>/isU', $string, $match)) {
            return $match[1];
        }

        return null;
    }

    /**
     * Find every <input> tag of the given type (case-insensitively).
     *
     * @param string $html markup to search
     * @param string $type value of the type attribute, e.g. "text"
     * @return array list of complete tag strings, in document order
     */
    private function findInputs($html, $type)
    {
        $attributes = '([' . self::ALLOWED_CHARS . ']{0,})';
        $pattern = '/<input' . $attributes . 'type=["\']?' . $type . '["\']?'
            . $attributes . '[\/]?>/i';

        return preg_match_all($pattern, $html, $found) ? $found[0] : array();
    }

    /**
     * Look up a posted field.
     *
     * @param string|null $name field name
     * @return string|null the posted value as a string, or null when the
     *                     field is absent or was posted as an array
     */
    private function postedValue($name)
    {
        $key = (string) $name;
        if (!array_key_exists($key, $_POST) || is_array($_POST[$key])) {
            return null;
        }

        return (string) $_POST[$key];
    }

    /**
     * Escape a posted value for output as HTML text.
     *
     * Existing entities are not encoded a second time.
     *
     * @param string $value raw posted value
     * @return string
     */
    private function escape($value)
    {
        return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    }

    /**
     * Text that replaces a checkbox/radio button and its label.
     *
     * @param string $tag   the input tag
     * @param string $value the tag's value attribute
     * @return string the escaped posted value when it equals $value, else ''
     */
    private function choiceReplacement($tag, $value)
    {
        $posted = $this->postedValue($this->getName($tag));

        return ($posted !== null && $posted === $value) ? $this->escape($posted) : '';
    }

    /**
     * Replace every match of $pattern with $replacement taken literally.
     *
     * A callback is used so that "$1" or "\1" inside the replacement are not
     * interpreted as back-references.
     *
     * @param string $pattern     complete PCRE pattern
     * @param string $replacement literal replacement text
     * @param string $subject     text to search
     * @return string $subject unchanged when the regex engine reports an error
     */
    private function replacePattern($pattern, $replacement, $subject)
    {
        $result = preg_replace_callback(
            $pattern,
            function () use ($replacement) {
                return $replacement;
            },
            $subject
        );

        return $result === null ? $subject : $result;
    }

    /**
     * Text that replaces a whole <select> element.
     *
     * @param string $select  markup of the select element
     * @param array  $options markup of each of its option elements
     * @return string
     */
    private function selectReplacement($select, array $options)
    {
        $name = (string) $this->getName($select);
        $posted = $this->postedValue($name);
        if ($posted === null) {
            return '';
        }

        if ($name !== 'myDay' && $name !== 'myMonth' && $name !== 'myYear') {
            return $this->escape($posted);
        }

        // Date parts go through mktime(), which needs a number.
        if (!is_numeric($posted)) {
            return '';
        }
        $number = (int) $posted;

        if ($name === 'myYear') {
            // Month 0 / day 0 of year+1 is a date inside the posted year.
            return $number === -1 ? '' : date('Y', mktime(0, 0, 0, 0, 0, $number + 1));
        }

        // For day and month the "-1" marker is read from the option selected in the markup.
        if ($this->selectedOptionValue($options) === '-1') {
            return '';
        }

        if ($name === 'myDay') {
            return date('j', mktime(0, 0, 0, 0, $number, 0));
        }

        // Day 0 of month+1 is the last day of the posted month.
        return date('F', mktime(0, 0, 0, $number + 1, 0, 0));
    }

    /**
     * Value of the option marked "selected" (the last one if several are),
     * falling back to the first option.
     *
     * @param array $options markup of each option element (non-empty)
     * @return string the value attribute, else the option text, trimmed
     */
    private function selectedOptionValue(array $options)
    {
        $chosen = $options[0];
        foreach ($options as $option) {
            if (preg_match('/selected/i', $option)) {
                $chosen = $option;
            }
        }

        if (preg_match('/value=["\'](.*)["\']/iU', $chosen, $match)
            || preg_match('/<option.*>(.*)<\/option>/isU', $chosen, $match)
        ) {
            return trim($match[1]);
        }

        return '';
    }
}
?>



Author: admin

Vinod Ram has been in the software industry since 2006 and has over 16 years of experience in software development and project management, specialising mainly in the LAMP stack and open-source technology: building enterprise-level web-based applications and large, database-driven, high-traffic websites, as well as project management. He loves to write informational articles and blog posts to share his knowledge and experience with the outside world and to help people find solutions to their problems.