lixiaosknb.py

# -*- coding: utf-8 -*-
import scrapy
import json
import pymysql.cursors
import time
import sys
import random

# Python 2 only: force the process-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')


class LixiaosknbSpider(scrapy.Spider):
    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']
    headers = {
        'Connection': 'keep-alive',  # keep the connection alive
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }

    def __init__(self):
        self.file = open('items.json', 'wb')
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='root',
                                       db='swdz_crm',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connect.cursor()
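
    # A minimal cleanup sketch (not in the original file): Scrapy calls a
    # spider's closed() hook when the crawl finishes, which is a natural
    # place to release the file handle and MySQL connection from __init__.
    def closed(self, reason):
        self.file.close()
        self.cursor.close()
        self.connect.close()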

    def parse(self, response):
        # Log in first; the auth token comes back in the JSON response.
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login
        )
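
    # Note (assumption, not in the original): FormRequest sends the body as
    # application/x-www-form-urlencoded, while the class-level headers declare
    # application/json. If the login endpoint really expects JSON, a sketch of
    # an equivalent request would be:
    # yield scrapy.Request(
    #     url='https://biz.lixiaoskb.com/api/user/login',
    #     method='POST',
    #     body=json.dumps({'username': '...', 'password': '...'}),
    #     headers={'Content-Type': 'application/json'},
    #     callback=self.after_login
    # )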

    def after_login(self, response):
        res = json.loads(response.body)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token.encode('utf-8')
        for i in range(1, 2):
            myFormData = {
                "keyword": "",
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": i
            }
            print(i)
            # Back off periodically to reduce the chance of being rate-limited.
            if i % 10 == 0:
                time.sleep(random.randint(5, 30))
            time.sleep(0.5)
            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
                                 method="POST",
                                 body=json.dumps(myFormData),
                                 headers=self.headers,
                                 callback=self.parse_item)
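
    # Note (not in the original): time.sleep() blocks Scrapy's single-threaded
    # reactor, stalling every in-flight request. The idiomatic way to throttle
    # is through settings; an illustrative class-level equivalent would be:
    # custom_settings = {
    #     'DOWNLOAD_DELAY': 0.5,
    #     'RANDOMIZE_DOWNLOAD_DELAY': True,
    #     'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
    # }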

    def parse_item(self, response):
        body = json.loads(response.body)
        items = body['data']['items']
        i = 0
        for item in items:
            i = i + 1
            id = item['id']
            market_company = item['value']
            time.sleep(random.random())
            if i % 10 == 0:
                time.sleep(random.randint(5, 10))
            # Fetch the basic company profile.
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + market_company + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + id + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': id, 'name': market_company},
                callback=self.get_baseinfo
            )

    # Fetch basic company info.
    def get_baseinfo(self, response):
        body = json.loads(response.body)
        id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']
        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        # Optional fields may be missing, so fall back to an empty string.
        opfrom = data.get('opfrom', '')
        opto = data.get('opto', '')
        officialWebsite = data.get('officialWebsite', '')
        regno = data.get('regno', '')
        legalperson = data.get('legalperson', '')
        address = data.get('address', '')
        businessscope = data.get('businessscope', '')
        entstatus = data.get('entstatus', '')
        enttype = data.get('enttype', '')
        industry = data.get('industry', '')
        regcap = data.get('regcap', '')
        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date, start_date, open_time, reg_capital, website)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], regno, legalperson, address, businessscope,
             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
             officialWebsite))
        # Commit the insert.
        self.connect.commit()
        company_id = self.cursor.lastrowid
        time.sleep(0.5)
        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
        # Fetch the company's websites.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs
        )
        time.sleep(0.5)
        # Fetch shareholder info.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders
        )
        time.sleep(0.5)
        # Fetch job postings.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_jobs
        )
        time.sleep(0.5)
        # Fetch online-advertising (SEM) records.
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_sem
        )
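
    # Note (assumption about the schema, not in the original): re-running the
    # spider inserts duplicate company_info rows. If company_name carried a
    # unique key, MySQL's upsert form would make the insert idempotent, e.g.
    # "insert into company_info(...) values (...) on duplicate key update
    # reg_no = values(reg_no)".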

    # Fetch online-advertising (SEM) records, paging through `offset`.
    def get_sem(self, response):
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                keywords = ','.join(item['semKeywords'])
                self.cursor.execute(
                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
                    values (%s, %s, %s, %s, %s, %s)""",
                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords,
                     item['sourceName']))
                # Commit the insert.
                self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            # Request the next page; recursion ends when the API returns no items.
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(id) + '&tag=5&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_sem
            )

    # Fetch the company's websites.
    def get_webs(self, response):
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name)
                    values (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
                # Commit the insert.
                self.connect.commit()

    # Fetch shareholder info.
    def get_shareholders(self, response):
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        if 'BE_INVEST' in body['data']:
            items = body['data']['BE_INVEST']['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
                        values (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
                    # Commit the insert.
                    self.connect.commit()

    # Fetch job postings, paging through `offset`.
    def get_jobs(self, response):
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
                    values (%s, %s, %s, %s, %s, %s, %s)""",
                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'],
                     item['releaseDate'], item['sourceName']))
                # Commit the insert.
                self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            # Request the next page; recursion ends when the API returns no items.
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(id) + '&tag=4&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_jobs
            )

    # Convert a millisecond timestamp from the API into a 'YYYY-MM-DD' string.
    def format_time(self, mytime):
        if mytime:
            timeStamp = float(mytime) / 1000
            timeArray = time.localtime(timeStamp)
            return time.strftime("%Y-%m-%d", timeArray)
        return ''
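
# Example (not in the original): the API's millisecond timestamps go through
# time.localtime, so the result depends on the machine's timezone:
#   spider.format_time(1514736000000)  # -> '2018-01-01' on a UTC+8 machine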