Crawle.php 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. <?php
  2. namespace App\Console\Commands;
  3. use App\Model\BuildInfo;
  4. use App\Model\Last;
  5. use App\Model\Links;
  6. use Illuminate\Console\Command;
  7. use Illuminate\Support\Facades\Log;
  8. use QL\QueryList;
  9. class Crawle extends Command
  10. {
  11. /**
  12. * The name and signature of the console command.
  13. *
  14. * @var string
  15. */
  16. protected $signature = 'crawle:{type}';
  17. /**
  18. * The console command description.
  19. *
  20. * @var string
  21. */
  22. protected $description = 'Command description';
  23. const DOMAIN = 'https://cd.lianjia.com';
  24. /**
  25. * Create a new command instance.
  26. *
  27. * @return void
  28. */
  29. public function __construct()
  30. {
  31. parent::__construct();
  32. }
  33. /**
  34. * Execute the console command.
  35. *
  36. * @return mixed
  37. */
  38. public function handle()
  39. {
  40. $type = $this->argument('type');
  41. $count = explode('-',$type);
  42. if(is_array($count) && count($count) == 3){
  43. $start = $count[1];
  44. $end = $count[2];
  45. }
  46. if($type == 'list'){
  47. self::getList();
  48. } else if($type == 'detail'){
  49. self::getDetail($start,$end);
  50. } else if($type == 'findlose'){
  51. self::getloseinfo();
  52. }
  53. }
  54. public static function getList()
  55. {
  56. $last = Last::where(['type'=>1])->first();
  57. $communites = Links::where(['type'=>1]);
  58. if(!empty($last) && $last->last_id !== 0 ){
  59. $communites->where('id','>=',$last->last_id);
  60. }
  61. $communites = $communites->get(['links']);
  62. foreach ($communites as $communite){
  63. if(empty($last)){
  64. Last::insert(['type'=>2,'last_id'=>$communite->id]);
  65. } else {
  66. Last::where(['type'=>2])->update(['last_id'=>$communite->id]);
  67. }
  68. for($i=1;$i<100;$i++){
  69. $areas = QueryList::get(self::DOMAIN.$communite->links.'pg'.$i,null,[
  70. 'headers' => [
  71. 'Referer'=>'http://www.baidu.com',
  72. 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
  73. 'Accept-Encoding' => 'gzip, deflate, br',
  74. ]
  75. ])->find('.xiaoquListItem .title a')->attrs('href');
  76. if(empty($areas->toArray())){
  77. Log::info('失效页数'.self::DOMAIN.$communite->links.'pg'.$i);
  78. break;
  79. }
  80. foreach ($areas as $area){
  81. $is_have = Links::where(['links'=>$area])->value('id');
  82. if($is_have) continue;
  83. Links::insert(['links'=>$area,'type'=>2]);
  84. }
  85. sleep(rand(2,6));
  86. }
  87. }
  88. // self::getDetail();
  89. dd('end');
  90. }
  91. public static function getDetail($start=0,$end=0)
  92. {
  93. $rules = [
  94. // 楼盘名称
  95. 'build_name' => ['.detailTitle','text'],
  96. //市区
  97. 'district' => ['.l-txt a:eq(2)','text'],
  98. //社区
  99. 'community' => ['.l-txt a:eq(3)','text'],
  100. //简单地址
  101. 'sample_address' => ['.detailDesc','text'],
  102. //小区价格
  103. 'price'=>['.xiaoquUnitPrice','text'],
  104. //小区图片
  105. 'images' => ['.imgThumbnailList img','src'],
  106. //建成年份
  107. 'completed' => ['.xiaoquInfo .xiaoquInfoItem:eq(0) span:eq(1)','text'],
  108. //建筑结构
  109. 'structure_type' => ['.xiaoquInfo .xiaoquInfoItem:eq(1) span:eq(1)','text'],
  110. //物业公司
  111. 'tenement' => ['.xiaoquInfo .xiaoquInfoItem:eq(3) span:eq(1)','text'],
  112. //栋数
  113. 'bulid_num' => ['.xiaoquInfo .xiaoquInfoItem:eq(5) span:eq(1)','text'],
  114. //户数
  115. 'households' => ['.xiaoquInfo .xiaoquInfoItem:eq(6) span:eq(1)','text'],
  116. ];
  117. if($start > $end){
  118. dd('最后的数必须大于前面的数字');
  119. }
  120. if($end==5000){
  121. $type = 2;
  122. } else if($end == 10000) {
  123. $type = 3;
  124. } else if($end == 15200){
  125. $type = 4;
  126. } else {
  127. dd('位置类型');
  128. }
  129. $last = Last::where(['type'=>$type])->first();
  130. // dd($last->toArray());
  131. if(!empty($last) && $last->last_id !== 0 ){
  132. $start = $last->last_id ;
  133. }
  134. $link_info = Links::whereBetween('id',[$start,$end]);
  135. $link_info->where(['type'=>2])->chunk(100,function ($links) use ($rules,$last,$type){
  136. dd($links->toArray());
  137. try{
  138. foreach ($links as $link){
  139. echo '爬取详情链接'.$link->links.PHP_EOL;
  140. $html = QueryList::get($link->links,null,[
  141. 'headers' => [
  142. 'Referer'=>'http://www.baidu.com',
  143. 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
  144. 'Accept-Encoding' => 'gzip, deflate, br',
  145. ]
  146. ]);
  147. $build = $html->rules($rules)->query()->getData();
  148. $images = $html->find('.imgThumbnailList img')->attrs('src');
  149. sleep(rand(3,10));
  150. if(!empty($images)) $build['images'] = json_encode($images);
  151. if(empty($build->toArray())) continue;
  152. (!empty($build['bulid_num'])) ? $build['bulid_num'] = intval($build['bulid_num']) : $build['bulid_num'] = 0;
  153. (!empty($build['completed'])) ? $build['completed'] = intval($build['completed']) : $build['completed'] = 0;
  154. (!empty($build['price'])) ? $build['price'] = intval($build['price']) : $build['price'] = 0;
  155. if(empty($build['households'])) continue;
  156. if(!empty($build['households'])) $build['households'] = intval($build['households']);
  157. $is_have = BuildInfo::where(['build_name'=>$build['build_name']])->value('id');
  158. if($is_have){continue;}
  159. if(empty($last)){
  160. Last::insert(['type'=>$type,'last_id'=>$link->id]);
  161. } else {
  162. Last::where(['type'=>$type])->update(['last_id'=>$link->id]);
  163. }
  164. $build['link_id'] = $link->id;
  165. BuildInfo::insert($build->toArray());
  166. }
  167. } catch(Exception $e) {
  168. Log::info('爬取详情链接'.$link->links.PHP_EOL.'报错');
  169. dd($e->getMessage());
  170. }
  171. });
  172. dd('end');
  173. }
  174. public function getloseinfo(){
  175. $rules = [
  176. //小区价格
  177. 'price'=>['.xiaoquUnitPrice','text'],
  178. ];
  179. BuildInfo::where('price','=',null)->chunk(100,function ($builds) use ($rules){
  180. try{
  181. foreach ($builds as $build){
  182. sleep(rand(3,10));
  183. $linkInfo = Links::where(['id'=>$build->link_id])->first();
  184. echo '爬取详情链接'.$linkInfo->links.PHP_EOL;
  185. $html = QueryList::get($linkInfo->links,null,[
  186. 'headers' => [
  187. 'Referer'=>'http://www.baidu.com',
  188. 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
  189. 'Accept-Encoding' => 'gzip, deflate, br',
  190. ]
  191. ]);
  192. $price = $html->find('.xiaoquUnitPrice')->text();
  193. if(empty($price)){
  194. echo '没有价格'.$build->id;
  195. continue;
  196. }
  197. BuildInfo::where(['id'=>$build->id])->update(['price'=>$price]);
  198. }
  199. } catch(Exception $e) {
  200. dd('爬取详情链接'.$linkInfo->links.PHP_EOL.'报错'.$e->getMessage());
  201. }
  202. });
  203. }
  204. }