# -*- coding: utf-8 -*-
"""Scrapy spider for biz.lixiaoskb.com.

Logs in, runs a paged company search, then for every hit fetches the
company report plus its detail endpoints (base info, owned websites,
shareholders, job postings, SEM/web-promotion records) and persists
everything into the local `swdz_crm` MySQL database.

Ported from Python 2 (removed `reload(sys)` / `sys.setdefaultencoding`
and all `dict.has_key` calls, which do not exist in Python 3).
"""
import json
import random
import time

import pymysql.cursors
import scrapy


class LixiaosknbSpider(scrapy.Spider):
    """Crawl company data from biz.lixiaoskb.com into MySQL."""

    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']

    # Shared request headers; Authorization/Referer are filled in at runtime.
    headers = {
        'Connection': 'keep-alive',  # keep the TCP connection alive between requests
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }

    def __init__(self):
        super().__init__()
        # NOTE(review): this file handle is opened but never written to;
        # kept for backward compatibility, closed in `closed()`.
        self.file = open('items.json', 'wb')
        # NOTE(review): DB credentials are hard-coded — move them to Scrapy
        # settings or environment variables.
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='root',
            db='swdz_crm',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )
        self.cursor = self.connect.cursor()

    def closed(self, reason=None):
        """Scrapy shutdown hook: release the file handle and DB resources."""
        self.file.close()
        self.cursor.close()
        self.connect.close()

    def parse(self, response):
        """Submit the login form; `after_login` receives the auth token."""
        # NOTE(review): account credentials are hard-coded — externalize them.
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login,
        )

    def after_login(self, response):
        """Store the auth token and issue the paged search requests."""
        res = json.loads(response.text)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token
        # Currently only page 1 is crawled; widen the range to crawl more pages.
        for page in range(1, 2):
            form_data = {
                "keyword": "",
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": page,
            }
            print(page)
            if page % 10 == 0:
                # Back off periodically to reduce the chance of rate limiting.
                time.sleep(random.randint(5, 30))
            # NOTE(review): time.sleep blocks the Twisted reactor; prefer the
            # DOWNLOAD_DELAY setting. Kept to preserve existing pacing.
            time.sleep(0.5)
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/search',
                method="POST",
                body=json.dumps(form_data),
                headers=self.headers,
                callback=self.parse_item,
            )

    def parse_item(self, response):
        """For each search hit, request the company market report."""
        body = json.loads(response.text)
        for count, item in enumerate(body['data']['items'], start=1):
            ent_id = item['id']
            market_company = item['value']
            time.sleep(random.random())
            if count % 10 == 0:
                time.sleep(random.randint(5, 10))
            # Fetch the company base info.
            self.headers['Referer'] = ('https://biz.lixiaoskb.com/report/' + ent_id
                                       + '?company=' + market_company + '&source=search')
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + ent_id
                + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': ent_id, 'name': market_company},
                callback=self.get_baseinfo,
            )

    def get_baseinfo(self, response):
        """Persist the company base info, then fan out to the detail endpoints."""
        body = json.loads(response.text)
        ent_id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']

        def field(key):
            # Optional fields default to '' when absent from the payload.
            return data.get(key, '')

        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        opentime = self.format_time(field('opfrom')) + '-' + self.format_time(field('opto'))
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date,start_date,open_time,reg_capital,website) value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], field('regno'), field('legalperson'), field('address'),
             field('businessscope'), field('entstatus'), field('enttype'), field('industry'),
             annual_date, start_date, opentime, field('regcap'), field('officialWebsite')))
        self.connect.commit()
        company_id = self.cursor.lastrowid  # FK for all the detail tables below

        time.sleep(0.5)
        self.headers['Referer'] = ('https://biz.lixiaoskb.com/report/' + ent_id
                                   + '?company=' + name + '&source=search')
        # Owned websites.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + ent_id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs,
        )
        time.sleep(0.5)
        # Shareholders (tag=3).
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + ent_id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders,
        )
        time.sleep(0.5)
        # Job postings (tag=4), paged via `offset`.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + ent_id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': ent_id, 'name': name},
            callback=self.get_jobs,
        )
        time.sleep(0.5)
        # SEM / web-promotion records (tag=5), paged via `offset`.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + ent_id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': ent_id, 'name': name},
            callback=self.get_sem,
        )

    def get_sem(self, response):
        """Store SEM (web-promotion) records; recurse until a page is empty."""
        body = json.loads(response.text)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        ent_id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if not items:
            return  # empty page: pagination is done
        for item in items:
            keywords = ','.join(item['semKeywords'])
            self.cursor.execute(
                """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name) value (%s, %s, %s, %s, %s, %s)""",
                (company_id, item['semDate'], item['semTitle'], item['semUrl'],
                 keywords, item['sourceName'],))
        self.connect.commit()
        time.sleep(1)
        offset += 1
        if offset % 5 == 0:
            time.sleep(5)
        self.headers['Referer'] = ('https://biz.lixiaoskb.com/report/' + ent_id
                                   + '?company=' + name + '&source=search')
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id='
            + str(ent_id) + '&tag=5' + '&offset=' + str(offset),
            headers=self.headers,
            meta={'company_id': company_id, 'offset': offset, 'id': ent_id, 'name': name},
            callback=self.get_sem,
        )

    def get_webs(self, response):
        """Store the company's owned website domains."""
        body = json.loads(response.text)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name) value (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
            self.connect.commit()

    def get_shareholders(self, response):
        """Store shareholder records when the payload carries BE_INVEST data."""
        body = json.loads(response.text)
        company_id = response.meta['company_id']
        invest = body['data'].get('BE_INVEST')
        if invest:
            items = invest['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid) value (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'],
                         item['LIACCONAM'], item['LISUBCONAM']))
                self.connect.commit()

    def get_jobs(self, response):
        """Store job postings; recurse until a page is empty."""
        body = json.loads(response.text)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        ent_id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if not items:
            return  # empty page: pagination is done
        for item in items:
            self.cursor.execute(
                """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name) value (%s, %s, %s, %s, %s, %s, %s)""",
                (company_id, item['jobName'], item['location'], item['salary'],
                 item['jobUrl'], item['releaseDate'], item['sourceName']))
        self.connect.commit()
        time.sleep(1)
        offset += 1
        if offset % 5 == 0:
            time.sleep(5)
        self.headers['Referer'] = ('https://biz.lixiaoskb.com/report/' + ent_id
                                   + '?company=' + name + '&source=search')
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id='
            + str(ent_id) + '&tag=4' + '&offset=' + str(offset),
            headers=self.headers,
            meta={'company_id': company_id, 'offset': offset, 'id': ent_id, 'name': name},
            callback=self.get_jobs,
        )

    def format_time(self, mytime):
        """Format a millisecond epoch timestamp as 'YYYY-MM-DD'.

        Returns '' for falsy input (None, 0, '').  Uses true division so the
        sub-second part is not silently truncated before conversion.
        """
        if not mytime:
            return ''
        return time.strftime("%Y-%m-%d", time.localtime(mytime / 1000.0))