// Attempt 1: load the page into a DOMDocument and print the `src` of every <img> element.
$html = file_get_contents("any site");
$dom = new domDocument;
// The @ operator silences the warnings loadHTML() emits while parsing.
@$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$images = $dom->getElementsByTagName('img');
foreach ($images as $image) {
// Reads an object property named `src` rather than the HTML attribute;
// the asker reports that this prints nothing.
echo $image->src;
}
This returns nothing.
// Attempt 2: same as before, but read the HTML attribute through getAttribute().
$html = file_get_contents("any site");
$dom = new domDocument;
// The @ operator silences the warnings loadHTML() emits while parsing.
@$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$images = $dom->getElementsByTagName('img');
foreach ($images as $image) {
// Prints the attribute value; the asker reports relative URLs such as
// "/images/example.jpg", i.e. nothing here resolves the value against the page URL.
echo $image->getAttribute('src');
}
This returns a relative URL such as "/images/example.jpg".
// Attempt 3: JavaScript-style member access. In PHP `.` is the string concatenation
// operator, not member access, so this variant cannot work as intended.
$html = file_get_contents("any site");
$dom = new domDocument;
// The @ operator silences the warnings loadHTML() emits while parsing.
@$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$images = $dom->getElementsByTagName('img');
foreach ($images as $image) {
// `$image.src` concatenates $image with `src`; it does not read a member of $image.
echo $image.src;
}
This gives me:
Fatal error: Call to undefined function getElementsByTagName()
So, how can I get the absolute path?
I think you should combine your second solution with the URL of 'any site', because the src attribute of an image may contain only a relative path. From the web developer's point of view, there is no need to include the absolute path.
You can use parse_url to find the base URL:
// Work out the site root of $url as scheme://host[:port] and store it in $base_url.
// The page's own path is deliberately NOT appended: a root-relative image path such as
// "/images/example.jpg" must be joined to the host, not to the path of the page it was
// found on (otherwise the result would be ".../path/images/example.jpg").
$url = 'http://www.example.com/path?opt=234';
$parts = parse_url($url);
if (!is_array($parts) || !isset($parts['host'])) {
    // No scheme given (e.g. "www.example.com/path"), so parse_url() saw only a path:
    // assume http and parse again so that the host is recognised.
    $parts = parse_url('http://'.$url);
}
$base_url = (isset($parts['scheme']) ? $parts['scheme'] : 'http').'://'.$parts['host'];
if (isset($parts['port'])) {
    // Keep a non-default port, e.g. http://localhost:8080
    $base_url .= ':'.$parts['port'];
}
And then combine it with your code as follows:
// Print the src of every <img> on the page as an absolute URL.
// $base_url must already be defined before this snippet runs; it is prepended to
// every src value that is not absolute by itself.
$html = file_get_contents("any site");
if ($html === false || $html === '') {
    // Nothing was downloaded, so there is nothing to parse.
    exit('Could not download the page');
}
$dom = new domDocument;
$dom->preserveWhiteSpace = false;
// Real-world HTML is rarely valid. Let libxml collect its parse errors internally
// instead of hiding every possible error of the statement behind @.
$previous = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$images = $dom->getElementsByTagName('img');
foreach ($images as $image) {
    $src = $image->getAttribute('src');
    if ($src === '') {
        // <img> without a src: nothing to print.
        continue;
    }
    if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $src)) {
        // Already absolute (http:, https:, data:, ...) or protocol-relative:
        // prepending the base would corrupt it.
        echo $src;
    } else {
        // Join with exactly one slash, whether or not the src starts with "/".
        echo rtrim($base_url, '/').'/'.ltrim($src, '/');
    }
}
This code differentiates between src attributes with a relative URL and a full URL. It's a bit more robust than simple string concatenation and handles cases where the relative path doesn't begin with a slash, e.g. images/image.jpg vs. /images/image.jpg.
<?php
// Print the absolute URL of every <img> found on $site.
// Each src value is resolved against the page it came from:
//   - full URLs (anything with a scheme, e.g. http:, data:) are printed unchanged
//   - protocol-relative URLs ("//host/img.png") get the page's scheme
//   - root-relative paths ("/images/a.png") are joined to scheme://host[:port]
//   - other relative paths ("images/a.png") are joined to the page's directory
// "../" segments are not collapsed here.
$site = 'http://example.com/some/deeply/buried/page.html';

// Split the page URL once, outside the loop.
$site_info = parse_url($site);
$root = $site_info['scheme'].'://'.$site_info['host'];
if (isset($site_info['port'])) {
    $root .= ':'.$site_info['port'];
}
// Directory of the page: everything up to the last "/" of its path, without a
// trailing slash. Working on the parsed path (rather than dirname($site)) keeps a
// page URL with no path or with a trailing slash from being cut in the wrong place.
$site_path = isset($site_info['path']) ? $site_info['path'] : '/';
$dir = $root.rtrim(substr($site_path, 0, strrpos($site_path, '/') + 1), '/');

$html = file_get_contents($site);
if ($html === false || $html === '') {
    // Nothing was downloaded, so there is nothing to parse.
    exit('Could not download '.$site);
}
$dom = new domDocument;
$dom->preserveWhiteSpace = false;
// Let libxml collect the parse errors of sloppy HTML instead of suppressing with @.
$previous = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$images = $dom->getElementsByTagName('img');
foreach ($images as $image) {
    // get the img src attribute
    $img_path = $image->getAttribute('src');
    if ($img_path === '') {
        // no src at all: nothing to resolve
        continue;
    }
    // parse the src into its constituent parts
    $url_info = parse_url($img_path);
    // a src that cannot be parsed, or that carries its own scheme, is left untouched
    if (is_array($url_info) && !isset($url_info['scheme'])) {
        if (isset($url_info['host'])) {
            // protocol-relative: reuse the scheme of the page
            $img_path = $site_info['scheme'].':'.$img_path;
        } elseif (substr($img_path, 0, 1) === '/') {
            // root-relative: belongs directly under the host
            $img_path = $root.$img_path;
        } else {
            // relative to the page's directory; the whole src is kept so that
            // a query string such as "img.php?id=3" survives
            $img_path = $dir.'/'.$img_path;
        }
    }
    echo $img_path; // this should be a full URL
}
?>
It's working for me; try it too.
<?php
// Demo call: resolve a relative image path against the URL of the page it appears on.
$image_src         = "../images/example.jpg";     /* image url */
$page_url          = "http://php.net/manual/en/"; /* current page url */
$page_has_filename = false;                       /* does the page url end in a file name, like "http://server.com/file.html"? */
echo path_to_absolute( $image_src, $page_url, $page_has_filename );
/**
 * Resolve a (possibly relative) URL reference against the URL of the page it was found on.
 *
 * - References that carry their own scheme ("http:", "mailto:", "data:", ...) or that are
 *   protocol-relative ("//host/x") are returned unchanged.
 * - "?query" and "#fragment" references are attached to the page itself.
 * - "/rooted" references are attached to the scheme://host[:port] part of $base.
 * - Everything else is attached to the directory of $base.
 * "." and ".." path segments are resolved segment by segment and never climb above the root.
 *
 * @param string      $src          The reference to resolve (e.g. an href or src value).
 * @param string|null $base         URL of the current page; null is treated as an empty string.
 * @param bool        $has_filename True when $base ends in a file name
 *                                  ("http://server.com/file.html"); false when $base names a
 *                                  directory, with or without a trailing slash.
 * @return string The resolved URL.
 */
function path_to_absolute( $src, $base = null, $has_filename = false ) {
    $src  = (string) $src;
    $base = (string) $base;

    /* Its a full (or protocol-relative) url, so return it without modifying */
    if ( preg_match( '~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $src ) ) {
        return $src;
    }

    /* Split $base into "scheme://authority" ($root) and whatever follows it. Cutting by
       position avoids the old str_replace() on the path, which also removed the slashes of
       "http://" whenever the base path was just "/". */
    $root = "";
    $rest = $base;
    if ( preg_match( '~^(?:[a-z][a-z0-9+.\-]*:)?//[^/?#]*~i', $base, $m ) ) {
        $root = $m[0];
        $rest = (string) substr( $base, strlen( $root ) );
    }
    $cut        = strcspn( $rest, "?#" );
    $base_path  = (string) substr( $rest, 0, $cut );
    $base_tail  = (string) substr( $rest, $cut );
    $base_query = (string) substr( $base_tail, 0, strcspn( $base_tail, "#" ) );

    /* Directory that relative paths are resolved against (always ends in a slash) */
    if ( $has_filename ) {
        $slash = strrpos( $base_path, "/" );
        $dir   = ( $slash === false ) ? "" : substr( $base_path, 0, $slash + 1 );
    }
    else {
        $dir = rtrim( $base_path, "/" ) . "/";
    }
    if ( $dir === "" && $root !== "" ) {
        $dir = "/";
    }

    /* Path of the page itself; "?..." and "#..." references hang off this, with no extra slash */
    $document = $has_filename ? $base_path : $dir;
    if ( $document === "" && $root !== "" ) {
        $document = "/";
    }
    if ( $src === "" ) {
        return $root . $document . $base_query;
    }
    if ( $src[0] === "?" ) {
        return $root . $document . $src;
    }
    if ( $src[0] === "#" ) {
        return $root . $document . $base_query . $src;
    }

    /* Only the path part takes part in "../" resolution; query and fragment are re-attached as is */
    $cut    = strcspn( $src, "?#" );
    $path   = (string) substr( $src, 0, $cut );
    $suffix = (string) substr( $src, $cut );
    if ( $path[0] !== "/" ) {
        $path = $dir . $path;
    }

    /* Resolve "." and ".." one segment at a time */
    $floor    = ( $path[0] === "/" ) ? 1 : 0; /* keep the leading empty segment of a rooted path */
    $segments = explode( "/", $path );
    $last     = count( $segments ) - 1;
    $kept     = array();
    foreach ( $segments as $i => $segment ) {
        if ( $segment === "." || $segment === ".." ) {
            if ( $segment === ".." && count( $kept ) > $floor ) {
                array_pop( $kept );
            }
            if ( $i === $last ) {
                /* a trailing "." or ".." names a directory: keep the final slash */
                $kept[] = "";
            }
            continue;
        }
        $kept[] = $segment;
    }
    return $root . implode( "/", $kept ) . $suffix;
}
?>
Example usage: here we find the links on php.net, since it has so many relative links.
<?php
// Usage example: download one php.net manual page and print every link's href next to
// the absolute URL that path_to_absolute() produces for it.
$url = "http://www.php.net/manual/en/tokens.php";
$html = file_get_contents( $url );

$dom = new DOMDocument;
@$dom->loadHTML( $html ); // @ silences the warnings raised while parsing
$dom->preserveWhiteSpace = false;

foreach( $dom->getElementsByTagName( 'a' ) as $anchor ) {
    $href = $anchor->getAttribute( 'href' );
    // the page URL ends in a file name (tokens.php), hence $has_filename = true
    $resolved = path_to_absolute( $href, $url, true );
    echo $href." - ".$resolved."
";
}
/** Output as reported by the author:
* / - http://www.php.net/
* ...
* control-structures.while.php - http://www.php.net/manual/en/control-structures.while.php
* control-structures.do.while.php - http://www.php.net/manual/en/control-structures.do.while.php
* ...
* /sitemap.php - http://www.php.net/sitemap.php
* /contact.php - http://www.php.net/contact.php
* ...
* http://developer.yahoo.com/ - http://developer.yahoo.com/
* ...
* ?setbeta=1&beta=1 - http://www.php.net/manual/en/tokens.php?setbeta=1&beta=1
* ...
* #85872 - http://www.php.net/manual/en/tokens.php#85872
**/
?>