How to extract all the URLs from a web page in PHP

3 Answers

0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * Handled forms:
 *  - absolute URLs (with their own scheme) are returned untouched;
 *  - an empty reference resolves to the base URL itself;
 *  - "#fragment" / "?query" replace the fragment / query of the base;
 *  - "//host/path" (protocol-relative) borrows only the scheme of the base;
 *  - "/path" is resolved from the site root, anything else from the
 *    directory of the base path, with "." and ".." segments collapsed.
 *
 * @param string $relative_url Link exactly as written in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 * @return string Absolute URL, or $relative_url unchanged when $base_url has
 *                no scheme/host to resolve against.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;
    $base_url = (string) $base_url;

    // Guard first: indexing [0] on an empty string raises a warning.
    if ($relative_url === '') {
        return $base_url;
    }

    // A link that carries its own scheme is already absolute.
    $own_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($own_scheme) && $own_scheme !== '') {
        return $relative_url;
    }

    // Fragment-only link: keep the base (query included), swap the fragment.
    if ($relative_url[0] === '#') {
        return preg_replace('/#.*$/s', '', $base_url) . $relative_url;
    }

    // Query-only link: drop the old query and fragment of the base.
    if ($relative_url[0] === '?') {
        return preg_replace('/[?#].*$/s', '', $base_url) . $relative_url;
    }

    // Read the components explicitly instead of extract(): missing keys
    // (no path, no port) must not turn into undefined variables.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // Protocol-relative link: only the scheme comes from the base.
    if (substr($relative_url, 0, 2) === '//') {
        return $base['scheme'] . ':' . $relative_url;
    }

    // Rebuild the authority so credentials and a non-default port survive.
    $authority = $base['host'];
    if (isset($base['user'])) {
        $credentials = $base['user'];
        if (isset($base['pass'])) {
            $credentials .= ':' . $base['pass'];
        }
        $authority = $credentials . '@' . $authority;
    }
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite
    // slashes or dots that belong to the query string or fragment.
    $cut = strcspn($relative_url, '?#');
    $relative_path = (string) substr($relative_url, 0, $cut);
    $suffix = (string) substr($relative_url, $cut);

    // Root-relative links ignore the base path; others start from the
    // directory of the base document (its path minus the last segment).
    $directory = '';
    if ($relative_path[0] !== '/' && isset($base['path'])) {
        $directory = preg_replace('#/[^/]*$#', '', $base['path']);
    }

    $path = $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes.
    $patterns = array('#/\.?/#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $path = preg_replace($patterns, '/', $path, -1, $count);
    } while ($count > 0);

    // ".." cannot climb above the root: discard any that are left in front.
    $path = preg_replace('#^/(?:\.\./)+#', '/', $path);

    return $base['scheme'] . '://' . $authority . $path . $suffix;
}

/**
 * Download a page with cURL, following redirects, and return its body.
 *
 * Response headers are deliberately not requested. With redirects enabled
 * cURL emits one header block per hop, so cutting the output at the first
 * blank line would leave the later header blocks inside the returned HTML.
 *
 * @param string $url Address of the page to fetch.
 * @return string Response body, or an empty string when the transfer fails.
 */
function get_html($url) {
    $handle = curl_init();
    if ($handle === false) {
        return '';
    }

    curl_setopt($handle, CURLOPT_HTTPGET, true);
    // Body only: no header block has to be stripped afterwards.
    curl_setopt($handle, CURLOPT_HEADER, false);
    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

    $output = curl_exec($handle);

    curl_close($handle);

    // curl_exec() yields false on failure; callers expect a string.
    return is_string($output) ? $output : '';
}
     
     
// Page to scan, and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";
$html = get_html($url);

// Real-world markup is rarely valid; collect parser diagnostics internally
// instead of printing a warning for every malformed tag.
libxml_use_internal_errors(true);
$doc = new DOMDocument();

// loadHTML() rejects an empty document (ValueError on PHP 8), so a failed
// download simply leaves the document empty and the loop below idle.
if ((string) $html !== '') {
    $doc->loadHTML($html);
}
libxml_clear_errors();

$link_tags = $doc->getElementsByTagName('a');
foreach ($link_tags as $linktag) {
    $href = trim($linktag->getAttribute('href'));

    // Compare against '' explicitly: a truthiness test would also drop "0".
    if ($href === '') {
        continue;
    }

    $link_absolute = relative_path_to_absolute_url($href, $base_url);

    // The URL comes from a remote page: escape it before emitting HTML.
    echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
}


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 



answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * Handled forms:
 *  - absolute URLs (with their own scheme) are returned untouched;
 *  - an empty reference resolves to the base URL itself;
 *  - "#fragment" / "?query" replace the fragment / query of the base;
 *  - "//host/path" (protocol-relative) borrows only the scheme of the base;
 *  - "/path" is resolved from the site root, anything else from the
 *    directory of the base path, with "." and ".." segments collapsed.
 *
 * @param string $relative_url Link exactly as written in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 * @return string Absolute URL, or $relative_url unchanged when $base_url has
 *                no scheme/host to resolve against.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;
    $base_url = (string) $base_url;

    // Guard first: indexing [0] on an empty string raises a warning.
    if ($relative_url === '') {
        return $base_url;
    }

    // A link that carries its own scheme is already absolute.
    $own_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($own_scheme) && $own_scheme !== '') {
        return $relative_url;
    }

    // Fragment-only link: keep the base (query included), swap the fragment.
    if ($relative_url[0] === '#') {
        return preg_replace('/#.*$/s', '', $base_url) . $relative_url;
    }

    // Query-only link: drop the old query and fragment of the base.
    if ($relative_url[0] === '?') {
        return preg_replace('/[?#].*$/s', '', $base_url) . $relative_url;
    }

    // Read the components explicitly instead of extract(): missing keys
    // (no path, no port) must not turn into undefined variables.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // Protocol-relative link: only the scheme comes from the base.
    if (substr($relative_url, 0, 2) === '//') {
        return $base['scheme'] . ':' . $relative_url;
    }

    // Rebuild the authority so credentials and a non-default port survive.
    $authority = $base['host'];
    if (isset($base['user'])) {
        $credentials = $base['user'];
        if (isset($base['pass'])) {
            $credentials .= ':' . $base['pass'];
        }
        $authority = $credentials . '@' . $authority;
    }
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite
    // slashes or dots that belong to the query string or fragment.
    $cut = strcspn($relative_url, '?#');
    $relative_path = (string) substr($relative_url, 0, $cut);
    $suffix = (string) substr($relative_url, $cut);

    // Root-relative links ignore the base path; others start from the
    // directory of the base document (its path minus the last segment).
    $directory = '';
    if ($relative_path[0] !== '/' && isset($base['path'])) {
        $directory = preg_replace('#/[^/]*$#', '', $base['path']);
    }

    $path = $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes.
    $patterns = array('#/\.?/#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $path = preg_replace($patterns, '/', $path, -1, $count);
    } while ($count > 0);

    // ".." cannot climb above the root: discard any that are left in front.
    $path = preg_replace('#^/(?:\.\./)+#', '/', $path);

    return $base['scheme'] . '://' . $authority . $path . $suffix;
}

     
// Page to scan, and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

// file_get_contents() returns false on failure; treat that as an empty page.
$content = file_get_contents($url);
if ($content === false) {
    $content = '';
}

// Keep only the anchor tags; every other tag is removed.
$tags = strip_tags($content, "<a>");

// Capture the quoted href value of every anchor. Unlike a literal search for
// '<a href="', this also finds anchors where other attributes come first,
// where single quotes are used, or where there is spacing around "=".
$found = preg_match_all('/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1/is', $tags, $matches);
$raw_hrefs = ($found && isset($matches[2])) ? $matches[2] : array();

foreach ($raw_hrefs as $raw_href) {
    // Attribute values are HTML-encoded in the source ("&amp;" means "&").
    $href = trim(html_entity_decode($raw_href, ENT_QUOTES, 'UTF-8'));
    if ($href === '') {
        continue;
    }

    $link_absolute = relative_path_to_absolute_url($href, $base_url);

    // The URL comes from a remote page: escape it before emitting HTML.
    echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
}


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 



answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * Handled forms:
 *  - absolute URLs (with their own scheme) are returned untouched;
 *  - an empty reference resolves to the base URL itself;
 *  - "#fragment" / "?query" replace the fragment / query of the base;
 *  - "//host/path" (protocol-relative) borrows only the scheme of the base;
 *  - "/path" is resolved from the site root, anything else from the
 *    directory of the base path, with "." and ".." segments collapsed.
 *
 * @param string $relative_url Link exactly as written in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 * @return string Absolute URL, or $relative_url unchanged when $base_url has
 *                no scheme/host to resolve against.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;
    $base_url = (string) $base_url;

    // Guard first: indexing [0] on an empty string raises a warning.
    if ($relative_url === '') {
        return $base_url;
    }

    // A link that carries its own scheme is already absolute.
    $own_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($own_scheme) && $own_scheme !== '') {
        return $relative_url;
    }

    // Fragment-only link: keep the base (query included), swap the fragment.
    if ($relative_url[0] === '#') {
        return preg_replace('/#.*$/s', '', $base_url) . $relative_url;
    }

    // Query-only link: drop the old query and fragment of the base.
    if ($relative_url[0] === '?') {
        return preg_replace('/[?#].*$/s', '', $base_url) . $relative_url;
    }

    // Read the components explicitly instead of extract(): missing keys
    // (no path, no port) must not turn into undefined variables.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // Protocol-relative link: only the scheme comes from the base.
    if (substr($relative_url, 0, 2) === '//') {
        return $base['scheme'] . ':' . $relative_url;
    }

    // Rebuild the authority so credentials and a non-default port survive.
    $authority = $base['host'];
    if (isset($base['user'])) {
        $credentials = $base['user'];
        if (isset($base['pass'])) {
            $credentials .= ':' . $base['pass'];
        }
        $authority = $credentials . '@' . $authority;
    }
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite
    // slashes or dots that belong to the query string or fragment.
    $cut = strcspn($relative_url, '?#');
    $relative_path = (string) substr($relative_url, 0, $cut);
    $suffix = (string) substr($relative_url, $cut);

    // Root-relative links ignore the base path; others start from the
    // directory of the base document (its path minus the last segment).
    $directory = '';
    if ($relative_path[0] !== '/' && isset($base['path'])) {
        $directory = preg_replace('#/[^/]*$#', '', $base['path']);
    }

    $path = $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes.
    $patterns = array('#/\.?/#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $path = preg_replace($patterns, '/', $path, -1, $count);
    } while ($count > 0);

    // ".." cannot climb above the root: discard any that are left in front.
    $path = preg_replace('#^/(?:\.\./)+#', '/', $path);

    return $base['scheme'] . '://' . $authority . $path . $suffix;
}
 
      
// Page to scan, and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

$html = file_get_contents($url);

// Collect parser diagnostics internally rather than silencing them with "@".
libxml_use_internal_errors(true);
$dom = new DOMDocument();

// file_get_contents() returns false on failure and loadHTML() rejects an
// empty document (ValueError on PHP 8), so only parse a non-empty download.
if (is_string($html) && $html !== '') {
    $dom->loadHTML($html);
}
libxml_clear_errors();

$xpath = new DOMXPath($dom);

// Select only anchors that actually carry an href attribute.
$anchors = $xpath->evaluate("/html/body//a[@href]");

foreach ($anchors as $anchor) {
    // A separate variable keeps $url (the page address) intact.
    $href = filter_var($anchor->getAttribute('href'), FILTER_SANITIZE_URL);
    if (!is_string($href) || $href === '') {
        continue;
    }

    $link_absolute = relative_path_to_absolute_url($href, $base_url);

    // Print only results that form a syntactically valid URL.
    if (filter_var($link_absolute, FILTER_VALIDATE_URL) !== false) {
        // The URL comes from a remote page: escape it before emitting HTML.
        echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . '<br />';
    }
}



/*
run:
     
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
      
*/

 



answered Sep 14, 2019 by avibootz

Related questions

1 answer 189 views
1 answer 168 views
1 answer 162 views
2 answers 202 views
1 answer 226 views
1 answer 145 views
145 views asked Sep 13, 2019 by avibootz
...