Working multi-account html grabber
This commit is contained in:
		
							parent
							
								
									1a8baf57a5
								
							
						
					
					
						commit
						d54b66a317
					
				@ -1,5 +0,0 @@
 | 
			
		||||
D
 | 
			
		||||
RewriteRule ^(artist|album|loadtrack)/([a-z]+)/$ ror_proxy.php?controller=$1&action=$2
 | 
			
		||||
RewriteRule ^(artist|album|loadtrack)/([a-z]+)/([a-z0-9-\+]+)(/|\.html)?$ ror_proxy.php?controller=$1&action=$2&id=$3
 | 
			
		||||
 | 
			
		||||
RewriteRule ^$ ror_proxy.php
 | 
			
		||||
							
								
								
									
										72
									
								
								php/bin/parser/worker_html_grabber.php
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										72
									
								
								php/bin/parser/worker_html_grabber.php
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@ -4,30 +4,72 @@
 | 
			
		||||
chdir('../..');
 | 
			
		||||
require_once 'common.php';
 | 
			
		||||
 | 
			
		||||
$db = Db::getInstance();
 | 
			
		||||
$vk = new Vkontakte();
 | 
			
		||||
// Минимальный интервал между запросами
 | 
			
		||||
define('VKTIMEOUT', 10);
 | 
			
		||||
define('QUEUE_PACK', 30);
 | 
			
		||||
define('EMPTY_QUEUE_TIMEOUT', 60);
 | 
			
		||||
 | 
			
		||||
$continue = true;
 | 
			
		||||
while ($continue) {
 | 
			
		||||
	$queue = $db->getRows($db->q("SELECT * FROM beathaven.queue WHERE status=0 OR status=2 ORDER BY priority DESC, times_failed ASC LIMIT 10"));
 | 
			
		||||
// Получаем имя бота
 | 
			
		||||
if (!isset($argv[1]) || !Config::get('bot:'. $argv[1])) {
 | 
			
		||||
	die('Wrong bot name: '. @$argv[1]);
 | 
			
		||||
}
 | 
			
		||||
$bot_name = ucfirst($argv[1]);
 | 
			
		||||
 | 
			
		||||
// Инициализация бота по имени
 | 
			
		||||
$vk = new Vkontakte($bot_name);
 | 
			
		||||
 | 
			
		||||
// Данные о работе бота
 | 
			
		||||
$stats = array(
 | 
			
		||||
	'started_job'	=> time(),
 | 
			
		||||
	'eneded_job'	=> time(),
 | 
			
		||||
	'pid'			=> getmypid(),
 | 
			
		||||
	'good_results'	=> 0,
 | 
			
		||||
	'bad_results'	=> 0,
 | 
			
		||||
	'queue_size'	=> 0,
 | 
			
		||||
	'last_request'	=> ''
 | 
			
		||||
);
 | 
			
		||||
 | 
			
		||||
// Устанавливаем коннект с БД
 | 
			
		||||
$db = Db::getInstance();
 | 
			
		||||
 | 
			
		||||
// Бот работает все время
 | 
			
		||||
while (true) {
 | 
			
		||||
	$queue = $db->getRows($db->q("SELECT * FROM beathaven.queue WHERE status=0 OR status=2 ORDER BY priority DESC, times_failed ASC LIMIT ". QUEUE_PACK));
 | 
			
		||||
	if (!$queue || count($queue) == 0) {
 | 
			
		||||
		$continue = false;
 | 
			
		||||
		sleep(EMPTY_QUEUE_TIMEOUT);
 | 
			
		||||
	} else {
 | 
			
		||||
		$stats['queue_size'] = count($queue);
 | 
			
		||||
		foreach ($queue as $t) {
 | 
			
		||||
			echo "#{$t['track_id']} {$t['track_title']}\n";
 | 
			
		||||
			$t1 = microtime(true);
 | 
			
		||||
			echo "#{$t['track_id']} {$t['track_title']} -- ";
 | 
			
		||||
			$ok = $vk->getTracks($t['track_title']);
 | 
			
		||||
			
 | 
			
		||||
			$vk->parse($t['track_title']);
 | 
			
		||||
			echo $vk->getHtml();
 | 
			
		||||
			
 | 
			
		||||
			$db->q("UPDATE beathaven.queue SET status=1 WHERE track_id=". $t['track_id']);
 | 
			
		||||
			if ($ok) {
 | 
			
		||||
				echo "OK\n";
 | 
			
		||||
				$db->q("UPDATE beathaven.queue SET status=1 WHERE track_id=". $t['track_id']);
 | 
			
		||||
				$file_name = Config::get('app:Parser:good_html_dir'). $t['track_id'] .'.html';
 | 
			
		||||
				$stats['good_results']++;
 | 
			
		||||
			} else {
 | 
			
		||||
				echo "FAILED\n\n";
 | 
			
		||||
				echo "FAILED\n";
 | 
			
		||||
				$db->q("UPDATE beathaven.queue SET status = 2, times_failed = times_failed + 1 WHERE track_id=". $t['track_id']);
 | 
			
		||||
				$file_name = Config::get('app:Parser:bad_html_dir'). $t['track_id'] .'.html';
 | 
			
		||||
				$stats['bad_results']++;
 | 
			
		||||
			}
 | 
			
		||||
			file_put_contents($file_name, $vk->getHtml());
 | 
			
		||||
			chmod($file_name, 0777);
 | 
			
		||||
			
 | 
			
		||||
			$stats['last_request'] = $t['track_title'];
 | 
			
		||||
			$stats['queue_size']--;
 | 
			
		||||
			$stats['eneded_job'] = time();
 | 
			
		||||
			
 | 
			
		||||
			$bot_stats_file_name = Config::get('app:Parser:bot_stats_dir'). $bot_name .'.json';
 | 
			
		||||
			file_put_contents($bot_stats_file_name, json_encode($stats));
 | 
			
		||||
			chmod($bot_stats_file_name, 0777);
 | 
			
		||||
			
 | 
			
		||||
			$t2 = microtime(true);
 | 
			
		||||
			if ($t2 - $t1 < 5) {
 | 
			
		||||
				sleep(ceil(5 - ($t2 - $t1)));
 | 
			
		||||
			if ($t2 - $t1 < VKTIMEOUT) {
 | 
			
		||||
				sleep(ceil(VKTIMEOUT - ($t2 - $t1)));
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
@ -3,4 +3,9 @@
 | 
			
		||||
define('ROOT_DIR', getcwd());
 | 
			
		||||
 | 
			
		||||
require_once ROOT_DIR .'/autoload.php';
 | 
			
		||||
 | 
			
		||||
// Конфиг приложения
 | 
			
		||||
Config::loadFile('app', 'config/app.ini');
 | 
			
		||||
 | 
			
		||||
// Конфиг ботов
 | 
			
		||||
Config::loadFile('bot', 'config/accounts.ini');
 | 
			
		||||
 | 
			
		||||
@ -1,23 +1,23 @@
 | 
			
		||||
[Bach]
 | 
			
		||||
user_id		= 5728795
 | 
			
		||||
email		= chezzzy@yandex.ru
 | 
			
		||||
password	= yabach!
 | 
			
		||||
remixsid	= 47c2f5501b22a3e3aa6947e5e74d1a72381267df2502570eb75c94481ade
 | 
			
		||||
remixchk	= 5
 | 
			
		||||
user_agent	= Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13
 | 
			
		||||
user_id		= "5728795"
 | 
			
		||||
email		= "chezzzy@yandex.ru"
 | 
			
		||||
password	= "yabach!"
 | 
			
		||||
remixsid	= "47c2f5501b22a3e3aa6947e5e74d1a72381267df2502570eb75c94481ade"
 | 
			
		||||
remixchk	= "5"
 | 
			
		||||
user_agent	= "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"
 | 
			
		||||
 | 
			
		||||
[Green]
 | 
			
		||||
user_id		= 69139853
 | 
			
		||||
email		= alexgreen1978@gmail.com
 | 
			
		||||
password	= fbcn136
 | 
			
		||||
remixsid	= bc5386a4f49f8bf7df20e11bdd311a7120818d83c23d93cd08177d5d3674
 | 
			
		||||
remixchk	= 5
 | 
			
		||||
user_agent	= Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.307.9 Safari/532.9
 | 
			
		||||
user_id		= "69139853"
 | 
			
		||||
email		= "alexgreen1978@gmail.com"
 | 
			
		||||
password	= "fbcn136"
 | 
			
		||||
remixsid	= "bc5386a4f49f8bf7df20e11bdd311a7120818d83c23d93cd08177d5d3674"
 | 
			
		||||
remixchk	= "5"
 | 
			
		||||
user_agent	= "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.307.9 Safari/532.9"
 | 
			
		||||
 | 
			
		||||
[Chez]
 | 
			
		||||
user_id		= 1217744
 | 
			
		||||
email		= phpdev.ru@gmail.com
 | 
			
		||||
password	= yanebach!
 | 
			
		||||
remixsid	= fc27c3a7874bc0b84477015e187e5e0bd3a71bdca02d98327595ef255773
 | 
			
		||||
remixchk	= 5
 | 
			
		||||
user_agent	= Mozilla/5.0 (Macintosh; U; PPC Max OS X Mach-O; en-US; rv:1.8.0.7) Gecko/200609211 Camino/1.0.3
 | 
			
		||||
user_id		= "1217744"
 | 
			
		||||
email		= "phpdev.ru@gmail.com"
 | 
			
		||||
password	= "yanebach!"
 | 
			
		||||
remixsid	= "fc27c3a7874bc0b84477015e187e5e0bd3a71bdca02d98327595ef255773"
 | 
			
		||||
remixchk	= "5"
 | 
			
		||||
user_agent	= "Mozilla/5.0 (Macintosh; U; PPC Max OS X Mach-O; en-US; rv:1.8.0.7) Gecko/200609211 Camino/1.0.3"
 | 
			
		||||
@ -7,4 +7,9 @@ host    = localhost
 | 
			
		||||
port    = 5432
 | 
			
		||||
dbname	= beathaven
 | 
			
		||||
login   = postgres
 | 
			
		||||
pass    = password
 | 
			
		||||
pass    = password
 | 
			
		||||
 | 
			
		||||
[Parser]
 | 
			
		||||
good_html_dir	= "/www/parser_data/html/good/"
 | 
			
		||||
bad_html_dir	= "/www/parser_data/html/bad/"
 | 
			
		||||
bot_stats_dir	= "/www/parser_data/stats/"
 | 
			
		||||
@ -3,7 +3,7 @@
 | 
			
		||||
/*****************************************************************
 | 
			
		||||
Пример использования:
 | 
			
		||||
 | 
			
		||||
$vk_parser = new Vkontakte();
 | 
			
		||||
$vk_parser = new Vkontakte($bot_name);
 | 
			
		||||
$vk_parser->parse('Blondie - Call Me');
 | 
			
		||||
$files = $vk_parser->getFiles();
 | 
			
		||||
*****************************************************************/
 | 
			
		||||
@ -22,6 +22,12 @@ class Vkontakte {
 | 
			
		||||
	private $_html;		// HTML, полученный от вконтактика
 | 
			
		||||
	private $_files;	// Распарсенные массивы с информацией о файле
 | 
			
		||||
	
 | 
			
		||||
	private $_bot_info;	// Информация о боте
 | 
			
		||||
	
 | 
			
		||||
	/**
	 * Creates a parser bound to one bot account.
	 *
	 * Looks up the settings stored under the key 'bot:<name>' via Config::get()
	 * and keeps the result in $_bot_info.
	 * NOTE(review): the lookup result is stored without any check here —
	 * presumably callers validate the bot name beforehand; confirm.
	 *
	 * @param string $bot_name	Name of the bot whose settings should be loaded
	 **/
	public function __construct($bot_name) {
 | 
			
		||||
		$this->_bot_info = Config::get('bot:'. $bot_name);
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	/**
 | 
			
		||||
	 * Оболочка парсера
 | 
			
		||||
	 *
 | 
			
		||||
@ -29,7 +35,7 @@ class Vkontakte {
 | 
			
		||||
	 * @return array	Массив с файлами
 | 
			
		||||
	 * @author chez
 | 
			
		||||
	 **/
 | 
			
		||||
	public function parse($q) {
 | 
			
		||||
	public function getTracks($q) {
 | 
			
		||||
		$this->_query = $q;
 | 
			
		||||
		$this->auth();
 | 
			
		||||
		$cookie = array();
 | 
			
		||||
@ -42,18 +48,20 @@ class Vkontakte {
 | 
			
		||||
			'X-Requested-With: XMLHttpRequest',
 | 
			
		||||
			'Origin: http://vkontakte.ru',
 | 
			
		||||
			'Content-Type: application/x-www-form-urlencoded',
 | 
			
		||||
			'User-Agent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.151 Safari/534.16',
 | 
			
		||||
			'User-Agent: '. $this->_bot_info['user_agent'],
 | 
			
		||||
			'Connection: close'
 | 
			
		||||
		));
 | 
			
		||||
		$this->setHtml(RemoteFile::getData('http://vkontakte.ru/audio', array(
 | 
			
		||||
			'act'			=>'search',
 | 
			
		||||
			'al'			=>'1',
 | 
			
		||||
			'gid'			=>'0',
 | 
			
		||||
			'id'			=>'5728795',
 | 
			
		||||
			'offset'		=>'0',
 | 
			
		||||
		$html = RemoteFile::getData('http://vkontakte.ru/audio', array(
 | 
			
		||||
			'act'			=> 'search',
 | 
			
		||||
			'al'			=> '1',
 | 
			
		||||
			'gid'			=> '0',
 | 
			
		||||
			'id'			=> $this->_bot_info['user_id'],
 | 
			
		||||
			'offset'		=> '0',
 | 
			
		||||
			'q'				=> urlencode($this->_query),
 | 
			
		||||
			'sort'			=>'2'
 | 
			
		||||
		)));
 | 
			
		||||
			'sort'			=> '2'
 | 
			
		||||
		));
 | 
			
		||||
		$this->setHtml($html);
 | 
			
		||||
		return (strlen($html) > 150);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/**
 | 
			
		||||
@ -96,8 +104,8 @@ class Vkontakte {
 | 
			
		||||
	 **/
 | 
			
		||||
	private function auth() {
 | 
			
		||||
		$this->_cookies = array(
 | 
			
		||||
			'remixchk' => 5,
 | 
			
		||||
			'remixsid' => 'cf8bdd79d451422c1d484532a58205d92fc46b79caab663a40624c812e01',
 | 
			
		||||
			'remixchk' => $this->_bot_info['remixchk'],
 | 
			
		||||
			'remixsid' => $this->_bot_info['remixsid'],
 | 
			
		||||
			'remixlang' => 777
 | 
			
		||||
		);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
@ -29,7 +29,9 @@ class BeatDB {
 | 
			
		||||
				return false;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		return (bool) file_put_contents($part . $path[$i], json_encode($data));
 | 
			
		||||
		file_put_contents($part . $path[$i], json_encode($data));
 | 
			
		||||
		chmod($part . $path[$i], 0777);
 | 
			
		||||
		return true;
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	public static function delete($key) {
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user