123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261 |
# -*- coding: utf-8 -*-
import scrapy
import json
import pymysql.cursors
import time
import sys
import random
# Python 2-only hack: force the process-wide default string encoding to UTF-8
# so implicit str<->unicode coercion of Chinese company names does not raise
# UnicodeDecodeError. NOTE(review): reload(sys)/sys.setdefaultencoding do not
# exist in Python 3 — this whole file is Python 2 and needs porting.
reload(sys)
sys.setdefaultencoding('utf8')
class LixiaosknbSpider(scrapy.Spider):
    """Crawl biz.lixiaoskb.com: log in, page through the company search API,
    then fetch and persist each company's registration info, owned web
    domains, shareholders, job postings and SEM (search-engine marketing)
    records into the `swdz_crm` MySQL database.

    NOTE(review): the time.sleep() calls in callbacks block Scrapy's reactor;
    the throttling only "works" because this crawl is effectively serial.
    Prefer DOWNLOAD_DELAY / AUTOTHROTTLE in settings instead.
    """
    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']
    # Shared header dict; Authorization and Referer are mutated per request.
    # NOTE(review): with concurrent requests the Referer value can race —
    # confirm the crawl really is serial before raising CONCURRENT_REQUESTS.
    headers = {
        'Connection': 'keep-alive',  # keep the TCP connection alive between requests
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }
    # (bug fix) removed the class-level `global company_id` statement: it was
    # a no-op at class scope; the id travels through request meta instead.

    def __init__(self):
        # Resources opened here are released in closed() when the spider stops.
        self.file = open('items.json', 'wb')
        # NOTE(review): hard-coded DB credentials; move to settings/env vars.
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='root',
                                       db='swdz_crm',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connect.cursor()

    def closed(self, reason):
        """Scrapy shutdown hook: close the DB cursor/connection and the output
        file (bug fix: these were previously leaked)."""
        try:
            self.cursor.close()
            self.connect.close()
        finally:
            self.file.close()

    def parse(self, response):
        """Log in to the site. NOTE(review): hard-coded account credentials."""
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login
        )

    def after_login(self, response):
        """Store the auth token, then page through the company search API."""
        res = json.loads(response.body)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token.encode('utf-8')
        for i in range(1, 2):  # currently only page 1; widen the range to crawl more
            myFormData = {
                "keyword": "",
                # server-side filter: location code 5101, no other constraints
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": i
            }
            print(i)
            if i % 10 == 0:
                time.sleep(random.randint(5, 30))  # longer pause every 10 pages
            time.sleep(0.5)
            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
                                 method="POST",
                                 body=json.dumps(myFormData),
                                 headers=self.headers,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """For each company in a search-result page, request its base report."""
        body = json.loads(response.body)
        items = body['data']['items']
        count = 0
        for item in items:
            count = count + 1
            company_key = item['id']        # renamed from `id` (shadowed builtin)
            market_company = item['value']  # company display name
            time.sleep(random.random())
            if count % 10 == 0:
                time.sleep(random.randint(5, 10))  # longer pause every 10 companies
            # fetch the company's base information
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + company_key + '?company=' + market_company + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + company_key + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': company_key, 'name': market_company},
                callback=self.get_baseinfo
            )

    # fetch base information
    def get_baseinfo(self, response):
        """Persist the company's registration data, then fan out follow-up
        requests for domains, shareholders, jobs and SEM records, passing the
        freshly inserted company_id through request meta."""
        body = json.loads(response.body)
        id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']
        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        # dict.get replaces dict.has_key (bug fix: has_key was removed in
        # Python 3; .get(k, '') is the drop-in equivalent in both versions)
        opfrom = data.get('opfrom', '')
        opto = data.get('opto', '')
        officialWebsite = data.get('officialWebsite', '')
        regno = data.get('regno', '')
        legalperson = data.get('legalperson', '')
        address = data.get('address', '')
        businessscope = data.get('businessscope', '')
        entstatus = data.get('entstatus', '')
        enttype = data.get('enttype', '')
        industry = data.get('industry', '')
        regcap = data.get('regcap', '')
        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date,start_date,open_time,reg_capital,website)
            value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], regno, legalperson, address, businessscope,
             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
             officialWebsite))
        # commit the SQL statement
        self.connect.commit()
        company_id = self.cursor.lastrowid
        time.sleep(0.5)
        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
        # fetch the company's websites
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs
        )
        time.sleep(0.5)
        # fetch shareholder information (tag=3)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders
        )
        time.sleep(0.5)
        # fetch job postings (tag=4)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_jobs
        )
        time.sleep(0.5)
        # fetch online marketing / SEM records (tag=5)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_sem
        )

    # fetch online marketing / SEM records
    def get_sem(self, response):
        """Store one page of SEM records, then follow the next offset page
        until the API returns an empty item list."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                keywords = ','.join(item['semKeywords'])
                self.cursor.execute(
                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
                    value (%s, %s, %s, %s, %s, %s)""",
                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords, item['sourceName'],
                     ))
            # commit the SQL statement
            self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)  # longer pause every 5 pages
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=5' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_sem
            )

    # fetch the company's websites
    def get_webs(self, response):
        """Store the company's registered web domains (single page)."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name)
                    value (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
            # commit the SQL statement
            self.connect.commit()

    # fetch shareholder information
    def get_shareholders(self, response):
        """Store the company's shareholders, if the report includes any."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        # `in` replaces dict.has_key (bug fix: removed in Python 3)
        if 'BE_INVEST' in body['data']:
            items = body['data']['BE_INVEST']['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
                        value (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
                # commit the SQL statement
                self.connect.commit()

    # fetch job postings
    def get_jobs(self, response):
        """Store one page of job postings, then follow the next offset page
        until the API returns an empty item list."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
                    value (%s, %s, %s, %s, %s, %s, %s)""",
                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'], item['releaseDate'],
                     item['sourceName']))
            # commit the SQL statement
            self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)  # longer pause every 5 pages
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=4' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_jobs
            )

    # convert a millisecond epoch timestamp into a date string
    def format_time(self, mytime):
        """Return `mytime` (epoch milliseconds) as 'YYYY-MM-DD' in local time,
        or '' for a falsy input (None / 0 / '')."""
        if mytime:
            # Divide by 1000.0 (bug fix: `float(mytime / 1000)` truncated to
            # whole seconds under Python 2 integer division before converting).
            timeStamp = mytime / 1000.0
            timeArray = time.localtime(timeStamp)
            return time.strftime("%Y-%m-%d", timeArray)
        return ''
|