wesley пре 6 година
комит
e5b20dc6f2

+ 7 - 0
.idea/misc.xml

xqd
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

xqd
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/qichacha.iml" filepath="$PROJECT_DIR$/.idea/qichacha.iml" />
+    </modules>
+  </component>
+</project>

+ 11 - 0
.idea/qichacha.iml

xqd
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>

+ 291 - 0
.idea/workspace.xml

xqd
@@ -0,0 +1,291 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="a6c5f3bf-d8f6-46bd-9b88-5e775bfb08b7" name="Default Changelist" comment="" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FUSProjectUsageTrigger">
+    <session id="-1691178588">
+      <usages-collector id="statistics.lifecycle.project">
+        <counts>
+          <entry key="project.closed" value="15" />
+          <entry key="project.open.time.0" value="4" />
+          <entry key="project.open.time.1" value="1" />
+          <entry key="project.open.time.10" value="1" />
+          <entry key="project.open.time.3" value="3" />
+          <entry key="project.open.time.4" value="3" />
+          <entry key="project.open.time.5" value="2" />
+          <entry key="project.open.time.7" value="1" />
+          <entry key="project.opened" value="15" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.open">
+        <counts>
+          <entry key="html" value="5" />
+          <entry key="json" value="2" />
+          <entry key="py" value="6" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.open">
+        <counts>
+          <entry key="HTML" value="5" />
+          <entry key="JSON" value="2" />
+          <entry key="Python" value="6" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.edit">
+        <counts>
+          <entry key="html" value="1" />
+          <entry key="py" value="937" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.edit">
+        <counts>
+          <entry key="HTML" value="1" />
+          <entry key="Python" value="937" />
+        </counts>
+      </usages-collector>
+    </session>
+  </component>
+  <component name="FileEditorManager">
+    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/qichacha/spiders/qcc.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="-1015">
+              <caret line="44" column="12" selection-start-line="44" selection-start-column="8" selection-end-line="44" selection-end-column="12" />
+              <folding>
+                <element signature="e#24#34#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/url.html">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/qichacha/spiders/__init__.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/qichacha/items.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="589">
+              <caret line="19" column="42" selection-start-line="19" selection-start-column="42" selection-end-line="19" selection-end-column="42" />
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/qichacha/middlewares.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/qichacha/pipelines.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="996">
+              <caret line="44" column="29" lean-forward="true" selection-start-line="25" selection-start-column="8" selection-end-line="44" selection-end-column="29" />
+              <folding>
+                <element signature="e#24#46#0" expanded="true" />
+                <marker date="1543197043246" expanded="true" signature="774:1569" ph="..." />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/qichacha/settings.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="2108">
+              <caret line="68" column="1" selection-start-line="68" selection-start-column="1" selection-end-line="68" selection-end-column="1" />
+            </state>
+          </provider>
+        </entry>
+      </file>
+    </leaf>
+  </component>
+  <component name="FindInProjectRecents">
+    <findStrings>
+      <find>cookie</find>
+      <find>pip</find>
+      <find>organization_code</find>
+      <find>registrate_num</find>
+    </findStrings>
+  </component>
+  <component name="IdeDocumentHistory">
+    <option name="CHANGED_PATHS">
+      <list>
+        <option value="$PROJECT_DIR$/qichacha/settings.py" />
+        <option value="$PROJECT_DIR$/qichacha/items.py" />
+        <option value="$PROJECT_DIR$/url.html" />
+        <option value="$PROJECT_DIR$/qichacha/pipelines.py" />
+        <option value="$PROJECT_DIR$/qichacha/spiders/qcc.py" />
+      </list>
+    </option>
+  </component>
+  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsGulpfileManager">
+    <detection-done>true</detection-done>
+    <sorting>DEFINITION_ORDER</sorting>
+  </component>
+  <component name="ProjectFrameBounds" fullScreen="true">
+    <option name="x" value="1440" />
+    <option name="y" value="-220" />
+    <option name="width" value="2560" />
+    <option name="height" value="1440" />
+  </component>
+  <component name="ProjectView">
+    <navigator proportions="" version="1">
+      <foldersAlwaysOnTop value="true" />
+    </navigator>
+    <panes>
+      <pane id="ProjectPane">
+        <subPane>
+          <expand>
+            <path>
+              <item name="qichacha" type="b2602c69:ProjectViewProjectNode" />
+              <item name="qichacha" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="qichacha" type="b2602c69:ProjectViewProjectNode" />
+              <item name="qichacha" type="462c0819:PsiDirectoryNode" />
+              <item name="qichacha" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="qichacha" type="b2602c69:ProjectViewProjectNode" />
+              <item name="qichacha" type="462c0819:PsiDirectoryNode" />
+              <item name="qichacha" type="462c0819:PsiDirectoryNode" />
+              <item name="spiders" type="462c0819:PsiDirectoryNode" />
+            </path>
+          </expand>
+          <select />
+        </subPane>
+      </pane>
+      <pane id="Scope" />
+    </panes>
+  </component>
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="a6c5f3bf-d8f6-46bd-9b88-5e775bfb08b7" name="Default Changelist" comment="" />
+      <created>1542874780271</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1542874780271</updated>
+    </task>
+    <servers />
+  </component>
+  <component name="ToolWindowManager">
+    <frame x="1440" y="-220" width="2560" height="1440" extended-state="0" />
+    <editor active="true" />
+    <layout>
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.27760127" />
+      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+      <window_info id="Favorites" order="2" side_tool="true" />
+      <window_info anchor="bottom" id="Message" order="0" />
+      <window_info anchor="bottom" id="Find" order="1" />
+      <window_info anchor="bottom" id="Run" order="2" />
+      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+      <window_info anchor="bottom" id="TODO" order="6" />
+      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Version Control" order="8" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
+      <window_info anchor="bottom" id="Terminal" order="11" />
+      <window_info anchor="bottom" id="Python Console" order="12" />
+      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+      <window_info anchor="right" id="SciView" order="3" />
+      <window_info anchor="right" id="Database" order="4" />
+    </layout>
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="editorHistoryManager">
+    <entry file="file://$PROJECT_DIR$/items.json">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/test.html" />
+    <entry file="file://$PROJECT_DIR$/url.html">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/spiders/__init__.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/settings.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="2108">
+          <caret line="68" column="1" selection-start-line="68" selection-start-column="1" selection-end-line="68" selection-end-column="1" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/items.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="589">
+          <caret line="19" column="42" selection-start-line="19" selection-start-column="42" selection-end-line="19" selection-end-column="42" />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/middlewares.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/spiders/qcc.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="-1015">
+          <caret line="44" column="12" selection-start-line="44" selection-start-column="8" selection-end-line="44" selection-end-column="12" />
+          <folding>
+            <element signature="e#24#34#0" expanded="true" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/qichacha/pipelines.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="996">
+          <caret line="44" column="29" lean-forward="true" selection-start-line="25" selection-start-column="8" selection-end-line="44" selection-end-column="29" />
+          <folding>
+            <element signature="e#24#46#0" expanded="true" />
+            <marker date="1543197043246" expanded="true" signature="774:1569" ph="..." />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+  </component>
+</project>

Разлика између датотеке није приказан због своје велике величине
+ 7 - 0
items.json


+ 0 - 0
qichacha/__init__.py


BIN
qichacha/__init__.pyc


+ 35 - 0
qichacha/items.py

xqd
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class QichachaItem(scrapy.Item):
+    """Scraped company record; one ``scrapy.Field`` per attribute.
+
+    Fields that are commented out are currently not declared on the item.
+    """
+    name = scrapy.Field()
+    # phone = scrapy.Field()
+    website = scrapy.Field()
+    # email = scrapy.Field()
+    address = scrapy.Field()
+    registered_capital = scrapy.Field()  # registered capital
+    # contributed_capital = scrapy.Field()  # paid-in capital
+    status = scrapy.Field()  # operating status
+    establishment = scrapy.Field()  # date of establishment
+    social_code = scrapy.Field()  # unified social credit code
+    # taxpayer_num = scrapy.Field()  # taxpayer identification number
+    # registrate_num = scrapy.Field()  # registration number
+    organization_code = scrapy.Field()  # organization code
+    company_type = scrapy.Field()  # company type
+    industry_involed = scrapy.Field()  # industry
+    approval_date = scrapy.Field()  # approval date
+    registration_authority = scrapy.Field()  # registration authority
+    area = scrapy.Field()  # region
+    # english_name = scrapy.Field()  # English name
+    # used_name = scrapy.Field()  # former name
+    # insured_num = scrapy.Field()  # number of insured employees
+    # staff_size = scrapy.Field()  # staff size
+    operate_period = scrapy.Field()  # business term
+    business_scope = scrapy.Field()  # business scope

BIN
qichacha/items.pyc


+ 103 - 0
qichacha/middlewares.py

xqd
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class QichachaSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class QichachaDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 47 - 0
qichacha/pipelines.py

xqd
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+import pymysql.cursors
+import json
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class QichachaPipeline(object):
+    def __init__(self):
+        self.file = open('items.json', 'wb')
+        self.connect = pymysql.connect(host='localhost',
+                        user='root',
+                        password='root',
+                        db='swdz_crm',
+                        charset='utf8mb4',
+                        cursorclass=pymysql.cursors.DictCursor)
+
+        self.cursor = self.connect.cursor()
+
+    def process_item(self, item, spider):
+        line = json.dumps(dict(item)) + "\n"
+        self.file.write(line)
+
+        self.cursor.execute(
+        """insert into company_info(companyName, regNo, orgNo, openStatus, openTime,startDate,annualDate, regCapital, industry, entType, authority, district, scope, website, regAddr)
+        value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
+        (item['name'],
+        item['social_code'],
+        item['organization_code'],
+        item['status'],
+        item['operate_period'],
+        item['establishment'],
+        item['approval_date'],
+        item['registered_capital'],
+        item['industry_involed'],
+        item['company_type'],
+        item['registration_authority'],
+        item['area'],
+        item['business_scope'],
+        item['website'],
+        item['address']))
+        # 提交sql语句
+        self.connect.commit()
+
+        return item

BIN
qichacha/pipelines.pyc


+ 90 - 0
qichacha/settings.py

xqd
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for qichacha project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'qichacha'
+
+SPIDER_MODULES = ['qichacha.spiders']
+NEWSPIDER_MODULE = 'qichacha.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'qichacha (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Keep cookies enabled (enabled by default); the spider sends session cookies with its requests
+COOKIES_ENABLED = True
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'qichacha.middlewares.QichachaSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'qichacha.middlewares.QichachaDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'qichacha.pipelines.QichachaPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

BIN
qichacha/settings.pyc


+ 4 - 0
qichacha/spiders/__init__.py

xqd
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

BIN
qichacha/spiders/__init__.pyc


+ 217 - 0
qichacha/spiders/qcc.py

xqd
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+import sys
+import scrapy
+from qichacha.items import QichachaItem
+reload(sys)
+sys.setdefaultencoding("utf-8")
+
+class QccSpider(scrapy.Spider):
+    name = 'qcc'
+    allowed_domains = ['www.qichacha.com']
+
+    cookie = 'acw_tc=b68c8c9c15424252191214642e94214b05720c4e74420621051875d649; QCCSESSID=qk1ll94aagac8k2tatlo2h1h62; zg_did=%7B%22did%22%3A%20%221671fb5135712d-0407f0f915c2f4-35607400-384000-1671fb513589bd%22%7D; UM_distinctid=1671fb513f11ba-017043a1c22d06-35607400-384000-1671fb513f7ac9; _uab_collina=154242522035304047158746; hasShow=1; CNZZDATA1254842228=680175003-1542420654-https%253A%252F%252Fwww.qichacha.com%252F%7C1542868874; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1542593466,1542706763,1542866517,1542873588; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201542873588025%2C%22updated%22%3A%201542874174848%2C%22info%22%3A%201542425219932%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.baidu.com%22%2C%22cuid%22%3A%20%22a26c6fb1350ce17ade4209de83a1a0fa%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1542874175'
+    itemDict = {}
+    items = cookie.split(';')
+    for item in items:
+        key = item.split('=')[0].replace(' ', '')
+        value = item.split('=')[1]
+        itemDict[key] = value
+
+    cookies = itemDict
+
+    headers = {
+        'Connection': 'keep-alive',  # 保持链接状态
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
+    }
+
+    def start_requests(self):
+        url = 'https://www.qichacha.com/search?key=%25E8%25BD%25AF%25E4%25BB%25B6%25E5%25BC%2580%25E5%258F%2591&ajaxflag=1&province=SC&city=510100&'
+        yield scrapy.Request(url, headers=self.headers, cookies=self.cookies, callback=self.parse)
+
+    def parse(self, response):
+        # urls = response.css('#result-list a::attr(href)').extract()
+        urls = response.xpath('//tbody//td/a/@href').extract()
+        for url in urls:
+            url = response.urljoin(url)
+
+            filename = 'url.html'
+            with open(filename, 'a+') as f:
+                f.write(url)
+                f.write('\n')
+
+            yield scrapy.Request(url, headers=self.headers, cookies=self.cookies, callback=self.parse_item)
+
+    def parse_item(self, response):
+        item = QichachaItem()
+
+        # 公司名
+        name = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()
+        item['name'] = name.strip().replace('\n', '') if name else '暂无公司名信息'
+    # 电话
+        # phone = response.xpath('//div[@class="content"]/div[2]/span[1]/span[2]/span/text()').extract_first()
+        # item['phone'] = phone.strip().replace('\n', '') if phone else '暂无电话信息'
+
+        # 官网
+        website = response.xpath('//div[@class="content"]/div[2]/span[3]/a/@href').extract_first()
+        item['website'] = website.strip().replace('\n', '') if website else '暂无网站信息'
+
+        # 邮箱
+        # email = response.xpath('//div[@class="content"]/div[3]/span[1]/span[2]/a/text()').extract_first()
+        # if email:
+        #     item['email'] = email
+        # else:
+        #     email2 = response.xpath('//div[@class="content"]/div[3]/span[2]/text()').extract_first()
+        #     item['email'] = email2
+
+        # 地址
+        address = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[10]/td[2]/text()').extract_first()
+        item['address'] = address.strip().replace('\n', '') if address else '暂无地址信息'
+
+        # 注册资本
+        registered_capital = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[1]/td[2]/text()').extract_first()
+        item['registered_capital'] = registered_capital.replace('\n', '').strip().split('万')[0] if registered_capital else '暂无注册资本'
+
+        # 实缴资本
+        # contributed_capital = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[1]/td[4]/text()').extract_first()
+        # if contributed_capital:
+        #     item['contributed_capital'] = contributed_capital.replace('\n', '').strip()
+        # else:
+        #     item['contributed_capital'] = '暂无实缴资本'
+
+        # 经营状态
+        status = response.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[2]/td[2]/text()').extract_first()
+        if status:
+            item['status'] = status.replace('\n', '').strip()
+        else:
+            item['status'] = '暂无经营状态信息'
+
+        # 成立日期
+        establishment = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[2]/td[4]/text()').extract_first()
+        if establishment:
+            item['establishment'] = establishment.replace('\n', '').strip()
+        else:
+            item['establishment'] = '暂无成立日期信息'
+
+        # 统一社会信用代码
+        social_code = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[3]/td[2]/text()').extract_first()
+        if social_code:
+            item['social_code'] = social_code.replace('\n', '').strip()
+        else:
+            item['social_code'] = '暂无统一社会信息代码信息'
+
+        # 纳税人识别号
+        # taxpayer_num = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[3]/td[4]/text()').extract_first()
+        # if taxpayer_num:
+        #     item['taxpayer_num'] = taxpayer_num.replace('\n', '').strip()
+        # else:
+        #     item['taxpayer_num'] = '暂无纳税人识别号信息'
+
+        # 注册号
+        # registrate_num = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[4]/td[2]/text()').extract_first()
+        # if registrate_num:
+        #     item['registrate_num'] = registrate_num.replace('\n', '').strip()
+        # else:
+        #     item['registrate_num'] = '暂无注册号信息'
+
+        # 组织机构代码
+        organization_code = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[4]/td[4]/text()').extract_first()
+        if organization_code:
+            item['organization_code'] = organization_code.replace('\n', '').strip()
+        else:
+            item['organization_code'] = '暂无组织机构代码信息'
+
+        # 公司类型
+        company_type = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[5]/td[2]/text()').extract_first()
+        if company_type:
+            item['company_type'] = company_type.replace('\n', '').strip()
+        else:
+            item['company_type'] = '暂无公司类型信息'
+
+        # 所属行业
+        industry_involed = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[5]/td[4]/text()').extract_first()
+        if industry_involed:
+            item['industry_involed'] = industry_involed.replace('\n', '').strip()
+        else:
+            item['industry_involed'] = '暂无所属行业信息'
+
+        # 核准日期
+        approval_date = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[6]/td[2]/text()').extract_first()
+        if approval_date:
+            item['approval_date'] = approval_date.replace('\n', '').strip()
+        else:
+            item['approval_date'] = '暂无核准日期信息'
+
+        # 登记机关
+        registration_authority = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[6]/td[4]/text()').extract_first()
+        if registration_authority:
+            item['registration_authority'] = registration_authority.replace('\n', '').strip()
+        else:
+            item['registration_authority'] = '暂无登记机关信息'
+
+        # 所属地区
+        area = response.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[7]/td[2]/text()').extract_first()
+        if area:
+            item['area'] = area.replace('\n', '').strip()
+        else:
+            item['area'] = '暂无所属地区信息'
+
+        # 英文名
+        # english_name = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[7]/td[4]/text()').extract_first()
+        # if english_name:
+        #     item['english_name'] = english_name.replace('\n', '').strip()
+        # else:
+        #     item['english_name'] = '暂无英文名信息'
+
+        # 曾用名
+        # used = response.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[8]/td[2]')
+        # used_name = used.xpath('string(.)').extract_first()
+        # if used_name:
+        #     item['used_name'] = used_name
+        # else:
+        #     item['used_name'] = '暂无曾用名'
+
+        # 参保人数
+        # insured_num = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[8]/td[4]/text()').extract_first()
+        # if insured_num:
+        #     item['insured_num'] = insured_num.replace('\n', '').strip()
+        # else:
+        #     item['insured_num'] = '暂无参保人数信息'
+
+        # 人员规模
+        # staff_size = response.xpath(
+        #     '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[9]/td[2]/text()').extract_first()
+        # if staff_size:
+        #     item['staff_size'] = staff_size.replace('\n', '').strip()
+        # else:
+        #     item['staff_size'] = '暂无人员规模信息'
+
+        # 营业期限
+        operate_period = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[9]/td[4]/text()').extract_first()
+        if operate_period:
+            item['operate_period'] = operate_period.replace('\n', '').strip()
+        else:
+            item['operate_period'] = '暂无营业期限信息'
+
+        # 经营范围
+        business_scope = response.xpath(
+            '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[11]/td[2]/text()').extract_first()
+        if business_scope:
+            item['business_scope'] = business_scope.replace('\n', '').strip()
+        else:
+            item['business_scope'] = '暂无经营范围信息'
+        yield item

BIN
qichacha/spiders/qcc.pyc


+ 11 - 0
scrapy.cfg

xqd
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = qichacha.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = qichacha

+ 120 - 0
url.html

xqd
@@ -0,0 +1,120 @@
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html
+https://www.qichacha.com/firm_3c690129a9984f64b4af1598a261f021.html
+https://www.qichacha.com/firm_6b7e6feab8f6b2a0d1eddd661a2458a7.html
+https://www.qichacha.com/firm_769c8d92fd9ead8bc1bb3a0457b2d73b.html
+https://www.qichacha.com/firm_8126203f2f3e895d08aa28732f53ed9c.html
+https://www.qichacha.com/firm_b6c5cd74cce2a37a77ca10947b5ba67d.html
+https://www.qichacha.com/firm_ebd5665040ba8846605bf4793d7a8afc.html
+https://www.qichacha.com/firm_0b8575c8ae8f72292de906cc51450134.html
+https://www.qichacha.com/firm_bcebec42625d991d395b5c97c29c1f28.html
+https://www.qichacha.com/firm_44cf38383c5803ad111e56d5fa3f305c.html
+https://www.qichacha.com/firm_2bb029ec1d940781e7dcca32071cbfb7.html
+https://www.qichacha.com/firm_3537688794fdbd9d8ba78d072ddfe56d.html
+https://www.qichacha.com/firm_3dcd1a5c3f2e65623d0c75e999acce3a.html
+https://www.qichacha.com/firm_dcdbb26384f0f128b15ded6d55ef02a3.html
+https://www.qichacha.com/firm_2dc1698f4759e2ecd6d37d1cc8511530.html
+https://www.qichacha.com/firm_d373b77533ba40a75350b076d01d0c11.html
+https://www.qichacha.com/firm_d7ef5b5b6249a877e14f181a1038bfb6.html
+https://www.qichacha.com/firm_06c36a33216ea230106d84e24a97d3ab.html
+https://www.qichacha.com/firm_e0aa312fc6336894a8f02638c714ba7d.html
+https://www.qichacha.com/firm_e30f01929ee3353a9cc2a6be1349f911.html
+https://www.qichacha.com/firm_79883ba991cbe7cf86deb1656cc16de1.html

Неке датотеке нису приказане због велике количине промена