@@ -0,0 +1,261 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+import pymysql.cursors
+import time
+import random
+
+
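+# Spider for biz.lixiaoskb.com: logs in, pages through the open-search API,
+# then stores each company's profile, domains, shareholders, job postings and
+# SEM records in a local MySQL database via pymysql.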
+class LixiaosknbSpider(scrapy.Spider):
+    name = 'lixiaosknb'
+    allowed_domains = ['biz.lixiaoskb.com']
+    start_urls = ['https://biz.lixiaoskb.com/login']
+
+    headers = {
+        'Connection': 'keep-alive',  # keep the connection alive
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
+        'Content-Type': 'application/json'
+    }
+
+    def __init__(self):
+        # NOTE: items.json is opened here but never written to by this spider
+        self.file = open('items.json', 'wb')
+        self.connect = pymysql.connect(host='localhost',
+                                       user='root',
+                                       password='root',
+                                       db='swdz_crm',
+                                       charset='utf8mb4',
+                                       cursorclass=pymysql.cursors.DictCursor)
+        self.cursor = self.connect.cursor()
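+        # Assumes the swdz_crm schema (company_info, company_domains,
+        # company_shareholders, company_jobs, company_sems) already exists
+        # and that root/root are local development credentials.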
+
+    def parse(self, response):
+        # Log in first; every later API call needs the token from this response
+        yield scrapy.FormRequest(
+            url='https://biz.lixiaoskb.com/api/user/login',
+            formdata={'username': '17381599246', 'password': 'yj870102722'},
+            callback=self.after_login
+        )
+
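+    # The login API returns a JSON body containing a token; it is stored as
+    # the Authorization header so subsequent API requests are authenticated.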
+    def after_login(self, response):
+        res = json.loads(response.body)
+        print(res['data'])
+        token = res['data']['token']
+        self.headers['Authorization'] = token
+
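+        # NOTE: range(1, 2) crawls only the first results page; widen the
+        # range to fetch more pages. The sleeps below are a crude rate limit.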
+        for i in range(1, 2):
+            myFormData = {
+                "keyword": "",
+                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
+                "scope": "",
+                "pagesize": "50",
+                "page": i
+            }
+            print(i)
+            if i % 10 == 0:
+                time.sleep(random.randint(5, 30))
+
+            time.sleep(0.5)
+            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
+                                 method="POST",
+                                 body=json.dumps(myFormData),
+                                 headers=self.headers,
+                                 callback=self.parse_item)
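+
+    # Each search response holds one page of company hits; request the
+    # detailed market report for every hit.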
+    def parse_item(self, response):
+        body = json.loads(response.body)
+        items = body['data']['items']
+        i = 0
+        for item in items:
+            i = i + 1
+            id = item['id']
+            market_company = item['value']
+
+            time.sleep(random.random())
+            if i % 10 == 0:
+                time.sleep(random.randint(5, 10))
+
+            # Fetch the basic company report
+            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + market_company + '&source=search'
+            yield scrapy.Request(
+                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + id + '&market_company=' + market_company + '&market_source=search',
+                headers=self.headers,
+                meta={'id': id, 'name': market_company},
+                callback=self.get_baseinfo
+            )
+
+    # Fetch and store the company's basic registration info
+    def get_baseinfo(self, response):
+        body = json.loads(response.body)
+        id = response.meta['id']
+        name = response.meta['name']
+        data = body['data']['baseinfo']
+
+        annual_date = self.format_time(data['apprdate'])
+        start_date = self.format_time(data['esdate'])
+        opfrom = data.get('opfrom', '')
+        opto = data.get('opto', '')
+        officialWebsite = data.get('officialWebsite', '')
+        regno = data.get('regno', '')
+        legalperson = data.get('legalperson', '')
+        address = data.get('address', '')
+        businessscope = data.get('businessscope', '')
+        entstatus = data.get('entstatus', '')
+        enttype = data.get('enttype', '')
+        industry = data.get('industry', '')
+        regcap = data.get('regcap', '')
+
+        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
+        self.cursor.execute(
+            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date, start_date, open_time, reg_capital, website)
+            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
+            (data['entname'], regno, legalperson, address, businessscope,
+             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
+             officialWebsite))
+        # Commit the insert
+        self.connect.commit()
+        company_id = self.cursor.lastrowid
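+        # Fan out to the detail endpoints, threading the new row's primary
+        # key through request meta so each callback can link its records back.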
+
+        time.sleep(0.5)
+
+        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
+        # Fetch the company's websites
+        yield scrapy.Request(
+            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
+            headers=self.headers,
+            meta={'company_id': company_id},
+            callback=self.get_webs
+        )
+
+        time.sleep(0.5)
+        # Fetch shareholder info
+        yield scrapy.Request(
+            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
+            headers=self.headers,
+            meta={'company_id': company_id},
+            callback=self.get_shareholders
+        )
+
+        time.sleep(0.5)
+        # Fetch job postings
+        yield scrapy.Request(
+            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
+            headers=self.headers,
+            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
+            callback=self.get_jobs
+        )
+
+        time.sleep(0.5)
+        # Fetch online-promotion (SEM) records
+        yield scrapy.Request(
+            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
+            headers=self.headers,
+            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
+            callback=self.get_sem
+        )
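+
+    # Fetch online-promotion (SEM) records. Results are paginated, so this
+    # callback re-yields itself with an increasing offset until the API
+    # returns an empty item list.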
+    def get_sem(self, response):
+        body = json.loads(response.body)
+        company_id = response.meta['company_id']
+        offset = response.meta['offset']
+        id = response.meta['id']
+        name = response.meta['name']
+        items = body['data']['item']
+        if items:
+            for item in items:
+                keywords = ','.join(item['semKeywords'])
+                self.cursor.execute(
+                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
+                    values (%s, %s, %s, %s, %s, %s)""",
+                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords,
+                     item['sourceName']))
+                # Commit the insert
+                self.connect.commit()
+
+            time.sleep(1)
+            offset = offset + 1
+            if offset % 5 == 0:
+                time.sleep(5)
+
+            # Request the next page of SEM records
+            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
+            yield scrapy.Request(
+                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(id) + '&tag=5' + '&offset=' + str(offset),
+                headers=self.headers,
+                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
+                callback=self.get_sem
+            )
+
+    # Fetch the company's websites
+    def get_webs(self, response):
+        body = json.loads(response.body)
+        company_id = response.meta['company_id']
+        items = body['data']['items']
+        if items:
+            for item in items:
+                self.cursor.execute(
+                    """insert into company_domains(company_id, site_domain, site_home, site_name)
+                    values (%s, %s, %s, %s)""",
+                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
+                # Commit the insert
+                self.connect.commit()
+
+    # Fetch shareholder info
+    def get_shareholders(self, response):
+        body = json.loads(response.body)
+        company_id = response.meta['company_id']
+        if 'BE_INVEST' in body['data']:
+            items = body['data']['BE_INVEST']['data']
+            if items:
+                for item in items:
+                    self.cursor.execute(
+                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
+                        values (%s, %s, %s, %s, %s)""",
+                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
+                    # Commit the insert
+                    self.connect.commit()
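+
+    # Fetch job postings. Like get_sem, this callback pages through results
+    # by re-yielding itself with an increasing offset.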
+    def get_jobs(self, response):
+        body = json.loads(response.body)
+        company_id = response.meta['company_id']
+        offset = response.meta['offset']
+        id = response.meta['id']
+        name = response.meta['name']
+        items = body['data']['item']
+        if items:
+            for item in items:
+                self.cursor.execute(
+                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
+                    values (%s, %s, %s, %s, %s, %s, %s)""",
+                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'], item['releaseDate'],
+                     item['sourceName']))
+                # Commit the insert
+                self.connect.commit()
+
+            time.sleep(1)
+            offset = offset + 1
+            if offset % 5 == 0:
+                time.sleep(5)
+
+            # Request the next page of job postings
+            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
+            yield scrapy.Request(
+                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(id) + '&tag=4' + '&offset=' + str(offset),
+                headers=self.headers,
+                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
+                callback=self.get_jobs
+            )
+
+    # Convert a millisecond timestamp to a YYYY-MM-DD date string
+    def format_time(self, mytime):
+        if mytime:
+            timeStamp = mytime / 1000.0
+            timeArray = time.localtime(timeStamp)
+            return time.strftime("%Y-%m-%d", timeArray)
+        return ''
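+
+    # Optional cleanup sketch: Scrapy calls closed() when the spider finishes,
+    # which is a convenient place to release the MySQL connection and the
+    # (otherwise unused) items.json file handle.
+    def closed(self, reason):
+        self.cursor.close()
+        self.connect.close()
+        self.file.close()
+
+# Run with: scrapy crawl lixiaosknb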
|