1
0
Fork 0

Working multi-account html grabber

This commit is contained in:
magnolia-fan 2011-04-02 20:44:45 +04:00
parent 1a8baf57a5
commit d54b66a317
7 changed files with 110 additions and 53 deletions

View File

@ -1,5 +0,0 @@
D
RewriteRule ^(artist|album|loadtrack)/([a-z]+)/$ ror_proxy.php?controller=$1&action=$2
RewriteRule ^(artist|album|loadtrack)/([a-z]+)/([a-z0-9-\+]+)(/|\.html)?$ ror_proxy.php?controller=$1&action=$2&id=$3
RewriteRule ^$ ror_proxy.php

66
php/bin/parser/worker_html_grabber.php Normal file → Executable file
View File

@ -4,29 +4,71 @@
chdir('../..');
require_once 'common.php';
// Минимальный интервал между запросами
define('VKTIMEOUT', 10);
define('QUEUE_PACK', 30);
define('EMPTY_QUEUE_TIMEOUT', 60);
// Получаем имя бота
if (!isset($argv[1]) || !Config::get('bot:'. $argv[1])) {
die('Wrong bot name: '. @$argv[1]);
}
$bot_name = ucfirst($argv[1]);
// Инициализация бота по имени
$vk = new Vkontakte($bot_name);
// Данные о работе бота
$stats = array(
'started_job' => time(),
'eneded_job' => time(),
'pid' => getmypid(),
'good_results' => 0,
'bad_results' => 0,
'queue_size' => 0,
'last_request' => ''
);
// Устанавливаем коннект с БД
$db = Db::getInstance();
$vk = new Vkontakte();
$continue = true;
while ($continue) {
$queue = $db->getRows($db->q("SELECT * FROM beathaven.queue WHERE status=0 OR status=2 ORDER BY priority DESC, times_failed ASC LIMIT 10"));
// Бот работает все время
while (true) {
$queue = $db->getRows($db->q("SELECT * FROM beathaven.queue WHERE status=0 OR status=2 ORDER BY priority DESC, times_failed ASC LIMIT ". QUEUE_PACK));
if (!$queue || count($queue) == 0) {
$continue = false;
sleep(EMPTY_QUEUE_TIMEOUT);
} else {
$stats['queue_size'] = count($queue);
foreach ($queue as $t) {
echo "#{$t['track_id']} {$t['track_title']}\n";
$vk->parse($t['track_title']);
echo $vk->getHtml();
$t1 = microtime(true);
echo "#{$t['track_id']} {$t['track_title']} -- ";
$ok = $vk->getTracks($t['track_title']);
if ($ok) {
echo "OK\n";
$db->q("UPDATE beathaven.queue SET status=1 WHERE track_id=". $t['track_id']);
$file_name = Config::get('app:Parser:good_html_dir'). $t['track_id'] .'.html';
$stats['good_results']++;
} else {
echo "FAILED\n\n";
echo "FAILED\n";
$db->q("UPDATE beathaven.queue SET status = 2, times_failed = times_failed + 1 WHERE track_id=". $t['track_id']);
$file_name = Config::get('app:Parser:bad_html_dir'). $t['track_id'] .'.html';
$stats['bad_results']++;
}
file_put_contents($file_name, $vk->getHtml());
chmod($file_name, 0777);
$stats['last_request'] = $t['track_title'];
$stats['queue_size']--;
$stats['eneded_job'] = time();
$bot_stats_file_name = Config::get('app:Parser:bot_stats_dir'). $bot_name .'.json';
file_put_contents($bot_stats_file_name, json_encode($stats));
chmod($bot_stats_file_name, 0777);
$t2 = microtime(true);
if ($t2 - $t1 < 5) {
sleep(ceil(5 - ($t2 - $t1)));
if ($t2 - $t1 < VKTIMEOUT) {
sleep(ceil(VKTIMEOUT - ($t2 - $t1)));
}
}
}

View File

@ -3,4 +3,9 @@
define('ROOT_DIR', getcwd());
require_once ROOT_DIR .'/autoload.php';
// Конфиг приложения
Config::loadFile('app', 'config/app.ini');
// Конфиг ботов
Config::loadFile('bot', 'config/accounts.ini');

View File

@ -1,23 +1,23 @@
[Bach]
user_id = 5728795
email = chezzzy@yandex.ru
password = yabach!
remixsid = 47c2f5501b22a3e3aa6947e5e74d1a72381267df2502570eb75c94481ade
remixchk = 5
user_agent = Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13
user_id = "5728795"
email = "chezzzy@yandex.ru"
password = "yabach!"
remixsid = "47c2f5501b22a3e3aa6947e5e74d1a72381267df2502570eb75c94481ade"
remixchk = "5"
user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"
[Green]
user_id = 69139853
email = alexgreen1978@gmail.com
password = fbcn136
remixsid = bc5386a4f49f8bf7df20e11bdd311a7120818d83c23d93cd08177d5d3674
remixchk = 5
user_agent = Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.307.9 Safari/532.9
user_id = "69139853"
email = "alexgreen1978@gmail.com"
password = "fbcn136"
remixsid = "bc5386a4f49f8bf7df20e11bdd311a7120818d83c23d93cd08177d5d3674"
remixchk = "5"
user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.307.9 Safari/532.9"
[Chez]
user_id = 1217744
email = phpdev.ru@gmail.com
password = yanebach!
remixsid = fc27c3a7874bc0b84477015e187e5e0bd3a71bdca02d98327595ef255773
remixchk = 5
user_agent = Mozilla/5.0 (Macintosh; U; PPC Max OS X Mach-O; en-US; rv:1.8.0.7) Gecko/200609211 Camino/1.0.3
user_id = "1217744"
email = "phpdev.ru@gmail.com"
password = "yanebach!"
remixsid = "fc27c3a7874bc0b84477015e187e5e0bd3a71bdca02d98327595ef255773"
remixchk = "5"
user_agent = "Mozilla/5.0 (Macintosh; U; PPC Max OS X Mach-O; en-US; rv:1.8.0.7) Gecko/200609211 Camino/1.0.3"

View File

@ -8,3 +8,8 @@ port = 5432
dbname = beathaven
login = postgres
pass = password
[Parser]
good_html_dir = "/www/parser_data/html/good/"
bad_html_dir = "/www/parser_data/html/bad/"
bot_stats_dir = "/www/parser_data/stats/"

View File

@ -3,7 +3,7 @@
/*****************************************************************
Пример использования:
$vk_parser = new Vkontakte();
$vk_parser = new Vkontakte($bot_name);
$vk_parser->getTracks('Blondie - Call Me');
$files = $vk_parser->getFiles();
*****************************************************************/
@ -22,6 +22,12 @@ class Vkontakte {
private $_html; // HTML, полученый от вконтактика
private $_files; // Распарсеные массивы с информацией о файле
private $_bot_info; // Информация о боте
/**
 * Loads the configuration of the bot account this parser will act as.
 *
 * The configuration is read from the "bot" config namespace and is later
 * used for the request's user agent, user id and session cookies.
 *
 * @param string $bot_name Name of the bot's section in the bot config
 * @throws InvalidArgumentException If no configuration exists for the bot
 **/
public function __construct($bot_name) {
    $bot_info = Config::get('bot:'. $bot_name);
    // Fail fast: an unknown bot would otherwise send requests with an empty
    // user agent and empty session cookies.
    // NOTE(review): assumes Config::get() returns a falsy value for a
    // missing key, as the worker's own "Wrong bot name" check does - confirm.
    if (!$bot_info) {
        throw new InvalidArgumentException('Unknown bot: '. $bot_name);
    }
    $this->_bot_info = $bot_info;
}
/**
* Оболочка парсера
*
@ -29,7 +35,7 @@ class Vkontakte {
* @return array Массив с файлами
* @author chez
**/
public function parse($q) {
public function getTracks($q) {
$this->_query = $q;
$this->auth();
$cookie = array();
@ -42,18 +48,20 @@ class Vkontakte {
'X-Requested-With: XMLHttpRequest',
'Origin: http://vkontakte.ru',
'Content-Type: application/x-www-form-urlencoded',
'User-Agent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.151 Safari/534.16',
'User-Agent: '. $this->_bot_info['user_agent'],
'Connection: close'
));
$this->setHtml(RemoteFile::getData('http://vkontakte.ru/audio', array(
$html = RemoteFile::getData('http://vkontakte.ru/audio', array(
'act' => 'search',
'al' => '1',
'gid' => '0',
'id' =>'5728795',
'id' => $this->_bot_info['user_id'],
'offset' => '0',
'q' => urlencode($this->_query),
'sort' => '2'
)));
));
$this->setHtml($html);
return (strlen($html) > 150);
}
/**
@ -96,8 +104,8 @@ class Vkontakte {
**/
private function auth() {
    // Session cookies are taken from the bot's config entry so that each
    // bot authenticates as its own account. No session id is hard-coded
    // here: a literal entry under the same key would be silently
    // overridden by the config-driven one anyway.
    $this->_cookies = array(
        'remixchk' => $this->_bot_info['remixchk'],
        'remixsid' => $this->_bot_info['remixsid'],
        // Sent as a fixed value for every bot.
        'remixlang' => 777
    );
}

View File

@ -29,7 +29,9 @@ class BeatDB {
return false;
}
}
return (bool) file_put_contents($part . $path[$i], json_encode($data));
file_put_contents($part . $path[$i], json_encode($data));
chmod($part . $path[$i], 0777);
return true;
}
public static function delete($key) {