I have been working on a scraper for a month, trying to scrape links (href) fetched from MySQL. I have applied as many techniques to it as I could. Right now I am recursively calling a function that grabs the links from a website and then crawls those links further.
So far, after filtering out invalid links (#, javascript:void, etc.), I get roughly five to six hundred thousand records in about 30 minutes, but most of them are duplicates: if I query the distinct values from those records, I get only 50,000.
Here is my code
function multiRequest($urls) {
    global $link;
    $filter_links = array();
    $rolling_window = sizeof($urls);
    $master = curl_multi_init();

    // add additional curl options here
    $std_options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 35,
        CURLOPT_HEADER => false,
        CURLOPT_TIMEOUT => 30
    );
    $options = $std_options;

    // start the first batch of requests
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        $options[CURLOPT_PRIVATE] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
    }

    do {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if ($execrun != CURLM_OK) {
            break;
        }
        // a request was just completed -- find out which one
        while ($done = curl_multi_info_read($master)) {
            $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
            $html = curl_multi_getcontent($done['handle']);
            $domDoc = new DOMDocument('1.0');
            @$domDoc->loadHTML($html);
            $anchors = $domDoc->getElementsByTagName('a');
            foreach ($anchors as $element) {
                $href = $element->getAttribute('href');
                $href = rtrim($href, "/");
                $href = trim($href);
                // skip anchors, javascript/mailto links, images, PDFs, etc.
                if ((strpos($href, '#') !== false) || $href == '' || $href == $available_curl ||
                    (strpos($href, 'javascript:') !== false) || (strpos($href, 'index.php') !== false) ||
                    preg_match('/mailto:/', $href) ||
                    (strpos($href, '.jpg') !== false) || (strpos($href, '.jpeg') !== false) ||
                    (strpos($href, '.png') !== false) || (strpos($href, '.gif') !== false) ||
                    (strpos($href, '.tiff') !== false) || (strpos($href, '.tif') !== false) ||
                    (strpos($href, '.pdf') !== false)) {
                    continue;
                }
                // resolve relative links against the URL that was just crawled
                if (0 !== strpos($href, 'http')) {
                    $path = '/' . ltrim($href, '/');
                    $parts = parse_url($available_curl);
                    $href = $parts['scheme'] . '://';
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
                $href = rtrim($href, "/");
                $filter_links[] = $href;
            }
            $filter_links = array_unique($filter_links);
            $scraped_domain = remove_http($available_curl);
            $scraped_domain_key = key_domain_generator($scraped_domain);
            mysqli_query($link, "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($filter_links) . "' WHERE domain_u_key = '" . $scraped_domain_key . "'") or die(mysqli_error($link));
            $namecheap_filter_internal_array = extrnl_intrnl_filter($filter_links, $available_curl);
            curl_multi_remove_handle($master, $done['handle']);
        }
    } while ($running);
    curl_multi_close($master);

    // recurse on the internal links collected from this batch
    if (count($namecheap_filter_internal_array) > 0) {
        multiRequest($namecheap_filter_internal_array);
    }
}
function extrnl_intrnl_filter($href_array, $domain_link) {
    global $link;
    $is_external = 0;
    $workers = [];
    $x_count = 0;
    $namecheap_filter_internal_array = array(); // avoid an undefined variable when no internal links are found
    foreach ($href_array as $href) {
        $href_url = parse_url($href);
        $href_domain = $href_url['host'];
        $key_href = giveHost($href_domain);
        if (isexternal($href_domain, $domain_link) == 'External') {
            $domains_Query = "select count(*) as domain_found from domains where base_url='$key_href'";
            $domains_run_Query = mysqli_query($link, $domains_Query) or die(mysqli_error($link));
            $domaininfo = mysqli_fetch_assoc($domains_run_Query);
            // only handle external domains we have not seen before
            if ($domaininfo['domain_found'] == 0) {
                if (preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                    $is_external = 1;
                    if (domain_insert_check($href, $is_external)) {
                        echo 'progress';
                        $workers[$x_count] = new WorkerThreads($href);
                        $workers[$x_count]->start();
                        $x_count++;
                        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $domain_list_scrap . " > /dev/null 2> /dev/null &");
                        //exec("nohup php /var/www/test/tool2/index2.php " . $href . " > /dev/null 2> /dev/null &");
                        //exec("nohup php /var/www/test/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                    }
                }
            }
        } else {
            $is_external = 0;
            if (domain_insert_check($href, $is_external)) {
                $workers[$x_count] = new WorkerThreads($href);
                $workers[$x_count]->start();
                $x_count++;
                $namecheap_filter_internal_array[] = $href;
            }
        }
    }
    // wait for all worker threads started in this pass
    for ($forvar = 0; $forvar < $x_count; $forvar++) {
        $workers[$forvar]->join();
    }
    return array_unique($namecheap_filter_internal_array);
}
function domain_insert_check($href, $is_external) {
    global $link;
    $href_url = parse_url($href);
    $href_ex_https = remove_http($href);
    $href_domain = $href_url['host'];
    $href_scheme = $href_url['scheme'];
    $key_href_i = key_domain_generator($href_ex_https);
    $query = "insert into domains set domain_name = '" . addslashes($href_ex_https) . "',"
            . "doamin_schema = '" . $href_scheme . "',"
            . "base_url = '" . strtolower(giveHost($href_domain)) . "',"
            . "domain_u_key = '" . $key_href_i . "',"
            . "is_expired = '0',"
            . "is_scraped = '0',"
            . "is_external = '" . $is_external . "',"
            . "ExtBackLinks = '0',"
            . "RefDomains = '0',"
            . "ACRank = '0',"
            . "RefIPs = '0',"
            . "RefSubNets = '0',"
            . "RefDomainsEDU = '0',"
            . "RefDomainsGOV = '0',"
            . "Title = 'title',"
            . "total_scraped_links = '0',"
            . "CitationFlow = '0',"
            . "TrustFlow = '0',"
            . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
            . "TopicalTrustFlow_Value_0 = '0',"
            . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
            . "TopicalTrustFlow_Value_1 = '0',"
            . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
            . "TopicalTrustFlow_Value_2 = '0',"
            . "date_created = '" . date('Y-m-d H:i:s') . "',"
            . "user_id = 1";
    $result = mysqli_query($link, $query);
    if (!$result) {
        // log failed inserts so the crawl can keep going
        mysqli_query($link, "insert into domainerror SET error = '" . $key_href_i . "' , domains= '" . $href_ex_https . "', type='fail'");
        return false;
    } else {
        return true;
    }
}
I don't really have any idea how to optimize it further so that it grabs more records; I have already optimized it as far as I can. If I use PHP calls (exec) instead of curl, it chokes MySQL's max connections, and if I use pthreads, it runs the first time and then stops.
My first suggestion would be to replace DOMDocument with a regex, which is faster and has a lower memory footprint and parse time.
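A minimal sketch of what that could look like, assuming you only need the href values (the extract_hrefs helper is illustrative, and a regex tolerates less broken HTML than a real parser):

    // Sketch: pull href values in one regex pass instead of building a DOM.
    function extract_hrefs($html) {
        $hrefs = array();
        if (preg_match_all('/<a\s[^>]*href\s*=\s*["\']([^"\']+)["\']/i', $html, $matches)) {
            foreach ($matches[1] as $href) {
                $hrefs[] = trim($href);
            }
        }
        return $hrefs;
    }

    // usage inside the curl_multi loop, in place of DOMDocument:
    // $anchors = extract_hrefs($html);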
Another smaller improvement would be to replace the repeated linear array scans with an O(1) hashmap lookup where possible. For example, instead of:
$filter_links = array_unique($filter_links);
you should keep a map like $urlMap[$urlKey] = $url; if the key is not already present, proceed to insert it. A fast way to calculate the key is md5(), though there are faster hashes.
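A minimal sketch of that idea (the $urlMap and add_link names are just illustrative):

    // Sketch: O(1) duplicate check with a keyed map instead of array_unique().
    $urlMap = array();

    function add_link(&$urlMap, $url) {
        $urlKey = md5($url);               // any fast hash will do as the key
        if (!isset($urlMap[$urlKey])) {    // O(1) lookup instead of re-scanning the whole array
            $urlMap[$urlKey] = $url;
            return true;                   // new link
        }
        return false;                      // duplicate, skip it
    }

    // the unique links are then simply array_values($urlMap)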
Another big I/O problem I see is that you hit the database for every website you crawl. Instead of doing that, you could collect the data into a separate array and insert it all into your SQL server in one batch at the end.
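A rough sketch of that batching (only a few of your columns are shown, assuming the rest have defaults; queue_domain_row and flush_domain_rows are made-up helper names):

    // Sketch: collect rows during the crawl, flush them in one multi-row INSERT.
    $pendingRows = array();

    function queue_domain_row(&$pendingRows, $link, $domain_name, $domain_u_key, $is_external) {
        $pendingRows[] = "('" . mysqli_real_escape_string($link, $domain_name) . "','"
                       . mysqli_real_escape_string($link, $domain_u_key) . "','" . (int) $is_external . "')";
    }

    function flush_domain_rows(&$pendingRows, $link) {
        if (count($pendingRows) === 0) {
            return;
        }
        // one round-trip instead of one INSERT per discovered link
        $sql = "INSERT INTO domains (domain_name, domain_u_key, is_external) VALUES "
             . implode(',', $pendingRows);
        mysqli_query($link, $sql) or die(mysqli_error($link));
        $pendingRows = array();
    }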
Still, that will only gain you some speedup; in order to really scale you will have to think of a way to split your process across multiple servers. For that you will need a queue system; you could work with RabbitMQ: https://www.rabbitmq.com/
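As a rough illustration of the producer side, you could push every discovered URL onto a queue with php-amqplib (composer require php-amqplib/php-amqplib); the connection details and queue name below are placeholders, and each worker server would consume from the same queue and run the crawl:

    require_once __DIR__ . '/vendor/autoload.php';

    use PhpAmqpLib\Connection\AMQPStreamConnection;
    use PhpAmqpLib\Message\AMQPMessage;

    // Sketch: publish crawl jobs so several servers can share the work.
    $connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
    $channel = $connection->channel();
    $channel->queue_declare('crawl_queue', false, true, false, false); // durable queue

    function publish_url($channel, $url) {
        $msg = new AMQPMessage($url, array('delivery_mode' => AMQPMessage::DELIVERY_MODE_PERSISTENT));
        $channel->basic_publish($msg, '', 'crawl_queue');
    }

    publish_url($channel, 'http://example.com');

    $channel->close();
    $connection->close();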