During my internship my boss needed land data scraped. After casting around, I Googled my way to the China Land Market Network (https://www.landchina.com/default.aspx?tabid=263), which is probably the most comprehensive source of land data out there.
For a better reading experience, head over to CSDN or GitHub.
GitHub: https://github.com/AnTi-anti/china_land/tree/master
CSDN:https://blog.csdn.net/weixin_35770067/article/details/106735919
Target Analysis
The information we need is what appears in the final table above. Unlike the previous post, [scraping land data from the Xuzhou Municipal Bureau of Natural Resources and Planning](https://blog.csdn.net/weixin_35770067/article/details/106734339), this scrape involves several difficulties.
Page Structure Analysis
First, open the official site and click 土地供应 (Land Supply), then 结果公告 (Result Announcements). That takes you to the following page.
We need the 2015-2020 data, scraped one administrative region at a time, so Selenium is clearly called for. The structure is much like the previous post: first scrape the links behind the 土地坐落 (land location) column, then scrape each link's detail page.
Difficulties
The first difficulty: the site throws "access denied" 500 errors. At first I rotated IPs through a free proxy pool, but with so few usable IPs the scrape ran slower than with no proxy at all, so I abandoned proxies. When this happens, all you can do is pause for a while. It never occurred while scraping the link listing, only while extracting detail pages, so my workaround was to drop the links that had already been scraped and carry on extracting detail pages from the remaining ones.
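To resume after a block, a minimal sketch along these lines works, assuming (as in the scripts below) the links live in 徐州.json and one detail record is appended per scraped link to 徐州_信息.json:

```python
import json

# links scraped in step one
with open("徐州.json", "r", encoding="utf8") as f:
    all_urls = json.load(f)

# one JSON object is appended per finished link, so the line count
# tells us how far the previous run got
try:
    with open("徐州_信息.json", "r", encoding="utf8") as f:
        done = sum(1 for line in f if line.strip())
except FileNotFoundError:
    done = 0

remaining = all_urls[done:]  # feed only these into the detail-page scraper
print("resuming at link %d of %d" % (done, len(all_urls)))
```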
At first I ran everything over my own broadband and LAN and hit these blocks constantly; oddly enough, after switching to a Huawei cloud server they never appeared. Running both at the same time, my local machine would get blocked while the cloud server kept going. Strange. If anyone who understands the networking side can explain it, tell me in the comments; I'm not sure my own guess is right.
The second difficulty: captchas pop up frequently during the scrape. This isn't hard to deal with either; we can recognize them automatically. If a captcha appears, recognize and submit it; otherwise keep scraping. The function below does this with Baidu's general OCR API:
```python
import requests

def img_down_load(img):
    # fetch an access token for the Baidu OCR API
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=MQE9mLzD9296AQQ7byq40Iud&client_secret=n1ElwPtvGTBua67hyLIPZtp5IGciGGjV'
    response = requests.get(host)
    access_token = response.json()['access_token']
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic"
    request_url = request_url + "?access_token=" + access_token
    # the captcha <img> src is a base64 data URI; keep only the payload after the comma
    img = img.split(",")
    params = {"image": img[1]}
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers)
    if response:
        counts = response.json()['words_result'][0]["words"]
        return counts
```
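The split on the comma works because the site embeds the captcha image as a base64 data URI in the `<img>` tag's `src`, so the payload after the comma can be posted straight to the OCR endpoint without downloading anything.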
The third difficulty: only the first 200 pages of results are shown. Paging through naively therefore caps you at 200 pages of data. The fix is to also filter by date: enter a time range in the filter box shown above, then start paging.
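For example, a minimal sketch that slices 2015-2020 into one-year windows, each small enough to stay under the cap (the window bounds get typed into the site's date boxes, whose element ids aren't shown here):

```python
# one query per one-year window keeps every result set under the 200-page cap
date_windows = [("%d-01-01" % y, "%d-12-31" % y) for y in range(2015, 2021)]
for start, end in date_windows:
    print(start, end)  # type each window into the date filter, then page through
```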
The fourth difficulty: selecting the administrative region. Digging through the page source shows that you can filter by entering the first four digits of a city's administrative division code (the same prefix that starts its residents' ID-card numbers).
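A few Jiangsu prefixes for illustration; 3203, used in the script below, selects Xuzhou:

```python
# first four digits of the GB/T 2260 administrative division code;
# swap the value sent to the region filter to scrape a different city
CITY_CODES = {
    "南京": "3201",
    "无锡": "3202",
    "徐州": "3203",
    "常州": "3204",
    "苏州": "3205",
    "南通": "3206",
}
```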
Data Scraping
Step one, as usual, is scraping the links. The `img_down_load` function above is saved as test.py so the script below can import it.
```python
import time, json, random
from test import img_down_load
from selenium import webdriver

opt = webdriver.ChromeOptions()
ua = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
opt.add_argument('--user-agent=%s' % ua)
# opt.add_argument("--proxy-server=http://202.20.16.82:10152")
driver = webdriver.Chrome(options=opt)
driver.get("https://www.landchina.com/default.aspx?tabid=263&ComName=default")
time.sleep(random.randint(2, 5))

def pass_captcha():
    """Click through the anti-crawler interstitial ("点击继续访问网站"),
    OCR-ing its captcha; returns True if an interstitial was solved."""
    try:
        button = driver.find_element_by_xpath("/html/body/p/p[2]/table/tbody/tr[2]/td/input")
        if button.get_attribute("value") == "点击继续访问网站":
            img = driver.find_element_by_xpath("/html/body/p/p[2]/table/tbody/tr[1]/td[3]/img").get_attribute("src")
            driver.find_element_by_xpath("//*[@id='intext']").send_keys(img_down_load(img))
            button.click()
            time.sleep(random.randint(2, 5))
            return True
    except:
        pass  # no interstitial on this page
    return False

def filter_region(code):
    """Restrict results to one city via the hidden region input (3203 = Xuzhou)."""
    driver.find_element_by_id("TAB_QueryConditionItem256").click()
    driver.execute_script("document.getElementById('TAB_queryTblEnumItem_256_v').setAttribute('type', 'text');")
    driver.find_element_by_id('TAB_queryTblEnumItem_256_v').clear()
    driver.find_element_by_id("TAB_queryTblEnumItem_256_v").send_keys(code)
    driver.find_element_by_id("TAB_QueryButtonControl").click()

while True:
    pass_captcha()
    try:
        filter_region('3203')
        break
    except:
        driver.refresh()

list_info = []
time.sleep(random.randint(2, 5))
if pass_captcha():
    # the interstitial threw away the query, so re-apply the region filter
    try:
        filter_region('3203')
    except:
        pass

num = 183      # result pages for this query; bumped when a page must be retried
pages = 1
while pages < num:
    for i in range(2, 32):   # 30 result rows per page: tr[2]..tr[31]
        pass_captcha()
        try:
            urls = driver.find_element_by_xpath(
                "//*[@id='TAB_contentTable']/tbody/tr[%d]/td[3]/a" % i).get_attribute("href")
        except:
            driver.refresh()
            num += 1         # grant one extra pass for the lost page
            continue
        print(urls)
        list_info.append(urls)
    try:
        # a[12] is the "next page" link in the pager
        driver.find_element_by_xpath("//*[@id='mainModuleContainer_485_1113_1539_tdExtendProContainer']/table/tbody/tr[1]/td/table/tbody/tr[2]/td/p/table/tbody/tr/td[2]/a[12]").click()
    except:
        pass
    time.sleep(random.randint(2, 5))
    pages += 1

with open("徐州.json", "w", encoding="utf8") as f:
    json.dump(list_info, f, indent=1)
driver.quit()
```
The scraped links get saved to a JSON file.
Step two is extracting the detail page behind each link.
```python
import time, json, random
from test import img_down_load
from selenium import webdriver

with open("徐州.json", "r", encoding="utf8") as f:
    urls = json.load(f)

opt = webdriver.ChromeOptions()
ua = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
opt.add_argument('--user-agent=%s' % ua)
driver = webdriver.Chrome(options=opt)

# every cell of the detail table lives under the same container id,
# varying only in its form/row/column suffix
BASE = "//*[@id='mainModuleContainer_1855_1856_ctl00_ctl00_p1_%s_ctrl']"

def cell(field):
    """Text of a detail-table cell; raises if the cell is missing."""
    return driver.find_element_by_xpath(BASE % field).get_attribute("textContent")

def cell_or_blank(field):
    """Like cell(), but returns "" when the value cell is absent."""
    try:
        return cell(field)
    except:
        return ""

# (label cell, [value cells]) per field; multi-value fields are joined with "/"
FIELDS = [
    ("f1_r1_c1",  ["f1_r1_c2"]),               # xingzhengqu: administrative region
    ("f1_r17_c1", ["f1_r17_c2"]),              # xiangmu_name: project name
    ("f1_r16_c1", ["f1_r16_c2"]),              # xiangmu_weizhi: project location
    ("f1_r2_c1",  ["f1_r2_c2"]),               # mianji: area
    ("f1_r3_c1",  ["f1_r3_c2"]),               # yongtu: land use
    ("f1_r3_c3",  ["f1_r3_c4"]),               # fangshi: supply method
    ("f1_r19_c1", ["f1_r19_c2"]),              # years: tenure
    ("f1_r19_c3", ["f1_r19_c4"]),              # hangye_type: industry category
    ("f1_r20_c3", ["f1_r20_c4"]),              # pice: price
    ("f1_r9_c1",  ["f1_r9_c2", "f1_r23_c2"]),  # shiyong: value spans two cells
    ("f1_r21_c1", ["f2_r1_c2", "f2_r1_c4"]),   # rongjilv: plot ratio, lower/upper
    ("f1_r14_c3", ["f1_r14_c4"]),              # riqi: date
    ("f1_r22_c1", ["f1_r22_c2"]),              # gongkai_time
    ("f1_r22_c3", ["f1_r22_c4"]),              # suogong_time
]

for url in urls:
    print(url)
    driver.get(url)
    time.sleep(random.randint(2, 5))
    tudi_dict = {}
    tudi_list = []
    while True:
        try:
            # click through the anti-crawler interstitial if it is showing
            button = driver.find_element_by_xpath("/html/body/p/p[2]/table/tbody/tr[2]/td/input")
            if button.get_attribute("value") == "点击继续访问网站":
                img = driver.find_element_by_xpath("/html/body/p/p[2]/table/tbody/tr[1]/td[3]/img").get_attribute("src")
                driver.find_element_by_xpath("//*[@id='intext']").send_keys(img_down_load(img))
                button.click()
                time.sleep(random.randint(2, 5))
        except:
            print("没有验证码")   # no captcha on this page
        print("开始查找")         # start extracting
        try:
            # a missing label cell means the page did not render: refresh and retry
            for label_field, value_fields in FIELDS:
                key = cell(label_field)
                value = "/".join(cell_or_blank(v) for v in value_fields)
                print(key, value)
                tudi_dict[key] = value
            tudi_list.append(tudi_dict)
            break
        except:
            driver.refresh()
    print(tudi_list)
    with open('徐州_信息.json', 'a', encoding="UTF-8") as f:
        for data in tudi_list:
            f.write(json.dumps(data, ensure_ascii=False) + ',\n')
driver.quit()
```
The results again end up in a JSON file.
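Note that each record is appended as one JSON object per line with a trailing comma, so the file as a whole is not valid JSON. A minimal sketch for loading it back:

```python
import json

# each line of 徐州_信息.json is a JSON object followed by a comma;
# strip the comma per line before parsing
records = []
with open("徐州_信息.json", "r", encoding="UTF-8") as f:
    for line in f:
        line = line.strip().rstrip(",")
        if line:
            records.append(json.loads(line))
print("loaded %d records" % len(records))
```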
So far I have scraped data for six cities in Jiangsu. If you're interested in the source code and the data, head over to my GitHub.
GitHub: https://github.com/AnTi-anti/china_land/tree/master