Article updates:
19-03-22 — newly collected city data for 2018
18-11-28 — collected city data for 2017

Data download GitHub:https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases
<https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases>
For later updates, please refer to the other articles I have published; the content below will no longer be updated.

My train leaves at 6:30 on the morning of 18-01-28, heading home from Sanya. Tickets were hard to get, so I'm quite excited~
Disclaimer: the data and third-party interfaces (URLs) mentioned in this article are for learning purposes only; please do not abuse them~


Over the past few days I have been setting up a local test environment and found that the province/city/district data table was empty. The old data was collected in 2013 and covered four levels (province, city, district and county), about 48,000 rows in total. It is now quite dated: some newer city names could not be found in it during use, and the county-level data was never used at all. After thinking it over, I decided to collect a fresh set.

The newly collected province/city/district data contains 3,589 records. County-level data was not collected this time; it can be added later if it is ever needed.

data sources

The National Bureau of Statistics' statistical standard «Division Codes and Urban-Rural Classification Codes for Statistical Use, 2016 (as of 2016-07-31)». It was published on 2017-05-16 and is the current version.


data acquisition


As for data collection: because of work needs, I have had some exposure to small data-collection tasks. Since I mostly work with HTML and JS, and since long ago the IE browser allowed a local HTML file to make arbitrary cross-domain ajax requests and to read and write Excel files, I would simply write a single HTML file as a collection tool for other people to use, for things like batch-querying personnel information or exam results. So JS is my main language for data collection.

1. Grab raw data

Open web page http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
The province list is on that page. From there you can enter the city-level page, then the district-level page, and then the county-level page. The URL structure throughout is very simple, and the data format is easy to extract.


After opening the page, open the browser console and run the following code. This version only collects the province, city and district levels; county-level collection has been stripped out (the old 2013 code went down to county level). The code was written a long time ago and the style is a little ugly, but it works fine. The collection is "single-threaded"; because there is not much data, it is not slow:
/*
 * Get city names from
 * http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
 *
 * Browser-console script: paste it into the console while the page above is
 * open. It walks province -> city -> district pages one request at a time
 * (each request is started from the previous request's callback) and, when
 * finished, triggers a download of "data.txt" containing
 * `var CITY_LIST=<JSON>`.
 *
 * Exposes window.RunLoad(shen, si, xian); the three optional arguments are
 * the starting province / city / county indexes (default 0).
 *
 * NOTE(review): the message strings and the leading/trailing blanks inside
 * them look machine-translated; they are kept as found.
 */
(function(){
	if(!window.URL){
		throw new Error(" Browser version too low ");
	}

	/**
	 * GET `url` and call True(responseText) on HTTP 200, otherwise False().
	 * A timeout also ends in readyState 4 with a non-200 status, so it is
	 * reported through False() as well.
	 */
	function ajax(url,True,False){
		var xhr=new XMLHttpRequest();
		xhr.open("GET",url);
		// Set after open(): Internet Explorer only accepts `timeout`
		// between open() and send().
		xhr.timeout=1000;
		xhr.onreadystatechange=function(){
			if(xhr.readyState==4){
				if(xhr.status==200){
					True(xhr.responseText);
				}else{
					False();
				}
			}
		};
		xhr.send();
	}

	/** console.log wrapper that forwards all arguments. */
	function msg(){
		console.log.apply(console, arguments);
	}

	/** One node of the region tree (province, city or district). */
	function cityClass(name,url,code){
		this.name=name;
		this.url=url;
		this.code=code;
		this.child=[];
		this.tryCount=0; // load attempts made for this node
	}
	cityClass.prototype={
		/** Plain {name, code, child} copy of this subtree (url/tryCount omitted). */
		getValue:function(){
			var obj={name:this.name,code:this.code,child:[]};
			for(var i=0;i<this.child.length;i++){
				obj.child.push(this.child[i].getValue());
			}
			return obj;
		}
	};

	/** Load the province list from the index page into DATA, then call True(). */
	function load_all(True){
		var path="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016";
		ajax(path+"/index.html",function(text){
			var reg=/href='(.+?)'>(.+?)<br/ig,match;
			var idx;
			if((idx=text.indexOf("<tr class='provincetr'>"))+1){
				// Start matching at the first province row.
				reg.lastIndex=idx;
				while(match=reg.exec(text)){
					var url=match[1];
					// Relative link: resolve against the index directory.
					if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
						url=path+"/"+url;
					}
					var name=match[2];
					// The index page carries no province code; 0 is a placeholder.
					DATA.push(new cityClass(name,url,0));
				}
				True();
			}else{
				msg(" No province data found ");
			}
		},function(){
			msg(" Error reading province list "," Termination of procedure ");
		});
	}

	/**
	 * Load the province at DATA[JD.shen] (its city list, then every city via
	 * load_si) and continue with the following provinces. Calls True() after
	 * the last province, False() once a node has been attempted more than
	 * 3 times.
	 */
	function load_shen(True, False){
		var city=DATA[JD.shen];
		city.tryCount++;
		if(city.tryCount>3){
			msg(" Read Province ["+city.name+"] exceed 3 second ");
			False();
			return;
		}

		// Advance to the next province, or finish.
		function next(){
			JD.shen++;
			if(JD.shen>=DATA.length){
				JD.shen=0;
				True();
				return;
			}
			DATA[JD.shen].tryCount=0;
			load_shen(True,False);
		}
		function get(){
			msg(" Read Province ["+city.name+"]", getJD());
			save();
			if(!city.child.length){
				// The page yielded no city rows: nothing to descend into.
				// Skip instead of dereferencing an undefined child.
				next();
				return;
			}
			city.child[JD.si].tryCount=0;
			load_si(next,function(){ False(); });
		}

		if(city.child.length){
			get();
		}else{
			ajax(city.url,function(text){
				// Captures: 1 = link, 2 = code, 3 = name.
				var reg=/<tr class='citytr'>.+?href='(.+?)'>(.+?)<.+?'>(.+?)</ig;
				var match;
				while(match=reg.exec(text)){
					var url=match[1];
					if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
						url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;
					}
					var code=match[2];
					var name=match[3];
					city.child.push(new cityClass(name,url,code));
				}
				JD.si=0;
				get();
			},function(){
				// Retry; tryCount above bounds the number of attempts.
				load_shen(True,False);
			});
		}
	}

	/**
	 * Load the city at DATA[JD.shen].child[JD.si] (its countytr/towntr rows)
	 * and continue with the following cities of the same province. Calls
	 * True() after the last city, False() once a city has been attempted
	 * more than 3 times.
	 */
	function load_si(True,False){
		var shen=DATA[JD.shen];
		var city=shen.child[JD.si];
		city.tryCount++;
		if(city.tryCount>3){
			msg(" Read City ["+city.name+"] exceed 3 second ");
			False();
			return;
		}

		function get(){
			msg("___ Read City ["+city.name+"]", getJD());
			// A city page may yield no rows; only reset the counter when
			// the child actually exists.
			var first=city.child[JD.xian];
			if(first){
				first.tryCount=0;
			}
			JD.si++;
			if(JD.si>=shen.child.length){
				JD.si=0;
				True();
				return;
			}
			shen.child[JD.si].tryCount=0;
			load_si(True,False);
		}

		if(city.child.length){
			get();
		}else{
			ajax(city.url,function(text){
				var reg=/class='(?:countytr|towntr)'.+?<\/tr>/ig;
				var match;
				while(match=reg.exec(text)){
					// Two row shapes: linked cells (groups 1-3: link, code, name)
					// or plain cells (groups 4-5: code, name).
					var reg2=/class='(?:countytr|towntr)'.+?(?:<td><a href='(.+?)'>(.+?)<.+?'>(.+?)<|<td>(.+?)<.+?<td>(.+?)<)/ig;
					var match2;
					if(match2=reg2.exec(match[0])){
						var url=match2[1]||"";
						if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
							url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;
						}
						var code=match2[2]||match2[4];
						var name=match2[3]||match2[5];
						city.child.push(new cityClass(name,url,code));
					}else{
						msg(" Unknown city mode :");
						msg(city.url);
						msg(match[0]);
						throw new Error("end");
					}
				}
				JD.xian=0;
				get();
			},function(){
				// Retry; tryCount above bounds the number of attempts.
				load_si(True,False);
			});
		}
	}

	/** Human-readable progress string built from the JD cursor. */
	function getJD(){
		var str=" province :"+(JD.shen+1)+"/"+DATA.length;
		var shen=DATA[JD.shen];
		if(shen){
			str+=" city :"+(JD.si+1)+"/"+shen.child.length;
			var si=shen.child[JD.si];
			if(si){
				str+=" county :"+(JD.xian+1)+"/"+si.child.length;
			}else{
				str+=" county :"+JD.xian;
			}
		}else{
			str+=" city :"+JD.si+" county :"+JD.xian;
		}
		return str;
	}

	/** Progress-persistence hook; intentionally a no-op in this version. */
	function save(){ }

	var DATA=[]; // province nodes
	var JD;      // progress cursor: {shen, si, xian} indexes

	window.RunLoad=function(shen,si,xian){
		RunLoad.T1=Date.now();
		JD={
			shen:shen||0
			,si:si||0
			,xian:xian||0
		};

		function get(){
			DATA[JD.shen].tryCount=0;
			load_shen(function(){
				console.log(" complete :"+(Date.now()-RunLoad.T1)/1000+" second ");
				save();

				var data=[];
				for(var i=0;i<DATA.length;i++){
					data.push(DATA[i].getValue());
				}
				// The file starts with the bytes EF BB BF (UTF-8 BOM).
				var url=URL.createObjectURL(
					new Blob([
						new Uint8Array([0xEF,0xBB,0xBF])
						,"var CITY_LIST="
						,JSON.stringify(data,null,"\t")
					]
					,{"type":"text/plain"})
				);
				var downA=document.createElement("A");
				downA.innerHTML=" Download and query the files of a good city ";
				downA.href=url;
				downA.download="data.txt";
				document.body.appendChild(downA);
				downA.click();

				msg("-- complete --");
			},function(){
				save();
				msg(" Current progress :", getJD());
			});
		}

		// NOTE(review): save() never writes "load_data", so this resume path
		// is only taken if something else populated it; objects restored via
		// JSON.parse are plain objects without cityClass.prototype.getValue.
		var data=localStorage["load_data"];
		if(data){
			DATA=JSON.parse(data);
			get();
		}else{
			load_all(get);
		}
	};
})();//@ sourceURL=console.js

// Execute code now
RunLoad();
Capture screenshot :


2. Processing data and pinyin annotation

Data processing is easier , For example, number format , Name formatting, etc .

Pinyin annotation: we need to find an interface that converts Chinese characters to pinyin. There is only one requirement: "Chongqing" must come out as "chong qing"; a converter that produces "zhong
qing" is, of course, too low-quality. This single condition rules out about 80% of the conversion websites found through Baidu.

Open the conversion site we found in the browser: http://www.qqxiuzi.cn/zh/pinyin/
As of now it can still be called normally. Because the data has to be requested via ajax, running the code inside that page avoids any cross-domain problem. View the page source and note down the token value: conversion requests to this site must carry this token. Note: refresh the page to obtain a new token:



Because there is a lot of data in the pinyin step, "4 threads" are used for collection. First open the file collected in step 1 and paste its contents into the browser console of the conversion site (this is equivalent to importing the data), then run the following code:
/*
 * Pinyin annotation via http://www.qqxiuzi.cn/zh/pinyin/
 *
 * Request: POST http://www.qqxiuzi.cn/zh/pinyin/show.php
 *   t=<Chinese text>&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token=<page token>
 *
 * Load the data first: paste the contents of data.txt (which defines
 * CITY_LIST) into the console of that page, then run this script. It
 * flattens CITY_LIST into CITY_LIST2, fills in each entry's pinyin (`P`)
 * with 4 concurrent request chains, and downloads "data-pinyin.txt".
 *
 * Requires `$` (jQuery-style $.ajax) to be available in the page.
 *
 * NOTE(review): several string and regex literals below ("Municipal
 * District", "( city | province |...)" etc.) look machine-translated from
 * the original Chinese and padded with blanks; they are kept as found and
 * must be checked against the original source before this is run.
 */
window.PageToken=window.PageToken||"";

/** Strip leading and trailing whitespace. */
var FixTrim=function(name){
	return name.replace(/^\s+|\s+$/g,"");
};

var CITY_LIST2; // flattened list produced by QueryPinYin

/**
 * Flatten CITY_LIST, then look up the pinyin of every entry that has no `P`
 * yet. Calls end() once all 4 request chains have run out of entries.
 */
var QueryPinYin=function(end){
	if(!window.PageToken){
		console.error("Need PageToken");
		return;
	}

	var ids=[];

	/**
	 * Keep the original code in o.orgCode and shorten o.code by level:
	 * provinces (deep 0) keep their code and get orgCode "0"; deep 1 keeps
	 * the first 4 characters; deeper levels drop a trailing "000000" or "000".
	 */
	var fixCode=function(o){
		if(o.deep==0){
			o.orgCode="0";
		}else{
			o.orgCode=o.code;
			if(o.deep==1){
				// Was substr(o,4): the object argument only coerced to 0 by accident.
				o.code=o.code.substr(0,4);
			}else{
				// A few districts carry 3 more significant digits, hence the two alternatives.
				o.code=o.code.replace(/(000000|000)$/g,"");
			}
		}
		return o;
	};

	/**
	 * Build the flat record for node `o` with parent `p` (omitted for
	 * provinces): a shortened display name plus ids. The record is also
	 * stored on o.o2 so children can reuse the parent's shortened name.
	 */
	var fix=function(o,p){
		var name=o.name;
		if(o.deep==0){
			name=name.replace(/( city | province |( Uygur | Zhuang Nationality | Hui nationality )? Autonomous Region )$/ig,"");
		}else if(o.deep==1){
			if(name==" Municipal District "){
				// Placeholder city name: take the parent's shortened name.
				name=p.o2.name;
			}else if(/ administrative division $/ig.test(name)){
				name=" municipality directly under the Central Government ";
			}else if(name.length>2){
				name=name.replace(/ city $/ig,"");
			}
		}else{
			// Stripping the suffix unconditionally would create identical names.
			if(name.length>2 && name!=" Municipal District " && !/( autonomy .| region | mining area )$/.test(name)){
				name=name.replace(/( city | area | county | town | Management Committee | Sub district office )$/ig,"");
			}
		}

		var o2={
			name:name
			,ext_name:o.name
			,id:+o.code||0
			,ext_id:+o.orgCode
			,pid:p&&+p.code||0
			,deep:o.deep
		};
		o.o2=o2;
		return o2;
	};

	for(var i=0;i<CITY_LIST.length;i++){
		var shen=CITY_LIST[i];
		shen.deep=0;
		for(var i2=0;i2<shen.child.length;i2++){
			var si=shen.child[i2];
			if(!shen.code){
				// The province has no code of its own yet: derive it from the
				// first two characters of its first city's code and emit the
				// province record once.
				shen.code=si.code.substr(0,2);
				ids.push(fix(fixCode(shen)));
			}

			si.deep=1;
			ids.push(fix(fixCode(si),shen));

			for(var i3=0;i3<si.child.length;i3++){
				var qu=si.child[i3];
				qu.deep=2;
				ids.push(fix(fixCode(qu),si));
			}
		}
	}
	CITY_LIST2=ids;

	var idx=-1;   // index of the most recently claimed entry, shared by all chains
	var thread=4; // number of request chains still running

	var run=function(stack){
		stack=+stack||0;

		idx++;
		if(idx>=ids.length){
			thread--;
			if(thread==0){
				end();
			}
			return;
		}

		var idx_=idx;
		var id=ids[idx];
		if(id.P){
			// Already has pinyin: skip it. Every 50th consecutive skip goes
			// through setTimeout so the call stack can unwind.
			stack++;
			if(stack%50==0){
				setTimeout(function(){run()});
			}else{
				run(stack);
			}
			return;
		}

		var name=id.name;
		var tryCount=0;
		var tryLoad=function(){
			$.ajax({
				url:"/zh/pinyin/show.php"
				,data:"t="+encodeURIComponent(name)+"&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token="+PageToken
				,type:"POST"
				,dataType:"text"
				,timeout:1000
				,error:function(e){
					if(tryCount>3){
						// Give up on this entry (its P stays unset) and move on.
						console.error("--QueryPinYin error--"+e);
						run();
						return;
					}
					tryCount++;
					tryLoad();
				}
				,success:function(txt){
					// Drop markup and collapse whitespace in the response.
					txt=FixTrim(txt.replace(/<.+?>/g,"").replace(/\s+/g," "));
					id.P=txt;
					console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --");

					run();
				}
			});
		};
		tryLoad();
	};

	run();
	run();
	run();
	run();
};

/** Publish the result as window.CITY_LIST_PINYIN and download it as data-pinyin.txt. */
var ViewDown=function(){
	console.log(" complete :"+(Date.now()-RunPinYin.T1)/1000+" second ");
	window.CITY_LIST_PINYIN=CITY_LIST2;

	// The file starts with the bytes EF BB BF (UTF-8 BOM).
	var url=URL.createObjectURL(
		new Blob([
			new Uint8Array([0xEF,0xBB,0xBF])
			,"var CITY_LIST_PINYIN="
			,JSON.stringify(CITY_LIST2,null,"\t")
		]
		,{"type":"text/plain"})
	);
	var downA=document.createElement("A");
	downA.innerHTML=" Download and query the files of a good city ";
	downA.href=url;
	downA.download="data-pinyin.txt";
	document.body.appendChild(downA);
	downA.click();
};

/** Entry point: record the start time and run the lookup, then download. */
var RunPinYin=function(){
	RunPinYin.T1=Date.now();
	QueryPinYin(ViewDown);
};

// Execute code now
if(window.CITY_LIST){
	if(!PageToken){
		PageToken=prompt("Token");
	}
	RunPinYin();
}else{
	console.error("data.txt Not entered ");
}
You will be prompted to enter the token; paste in the token you found earlier, and the script starts working:


It is fast: the conversion completes in a little over 2 minutes.

3. Format as CSV

Now that we have all the data, let's export it to a more conventional format; CSV is the best choice. This export is fairly simple: open the file saved in step 2, paste its data into the console of any web page, then run the following code:
/*
 * Format and output as CSV.
 *
 * Load the data first: paste the contents of data-pinyin.txt (which defines
 * CITY_LIST_PINYIN) into the console, then run this script. It downloads
 * "ok_data.csv".
 *
 * Importing into a database: file format Unicode, text as character stream.
 * After importing (table area_city): check for duplicate ids and correct
 * them; add Hong Kong, Macao, Taiwan and the overseas entries; check for
 * duplicate names and amend them, e.g.
 *   select * from area_city where len(name)=1
 *   select pid,name,count(*) from area_city group by pid,name having COUNT(*)>1
 */

/** Strip leading and trailing whitespace; null/undefined become "". */
var FixTrim=function(name){
	// Entries whose pinyin lookup failed have no `P`; treat that as empty
	// text instead of throwing in the middle of the export.
	var text=name==null?"":String(name);
	return text.replace(/^\s+|\s+$/g,"");
};

/** Quote a value as a CSV field: trimmed, embedded quotes doubled. */
function CSVName(name){
	return '"'+FixTrim(name).replace(/"/g,'""')+'"';
}

/**
 * Build the CSV lines (header first) for a flat list of records with the
 * fields id, pid, deep, name, P, ext_id and ext_name.
 *
 * pinyin_prefix is the first two letters of the first pinyin syllable plus
 * the first letter of the second and third syllables, lower-cased.
 */
function BuildCityCSV(list){
	var rows=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"];
	for(var i=0;i<list.length;i++){
		var o=list[i];

		var pf="";
		var pinyin=FixTrim(o.P).toLowerCase();
		var ps=pinyin.split(" ");
		for(var j=0;j<ps.length&&j<3;j++){
			pf+=ps[j].substr(0,j==0?2:1);
		}

		rows.push(o.id+","+o.pid+","+o.deep+","+CSVName(o.name)
			+","+CSVName(pf)+","+CSVName(o.P)
			+","+CSVName(o.ext_id+"")+","+CSVName(o.ext_name));
	}
	return rows;
}

var CITY_CSV;
if(typeof CITY_LIST_PINYIN=="undefined"){
	console.error("data-pinyin.txt Not entered ");
}else{
	CITY_CSV=BuildCityCSV(CITY_LIST_PINYIN);

	// The file starts with the bytes EF BB BF (UTF-8 BOM).
	var url=URL.createObjectURL(
		new Blob([
			new Uint8Array([0xEF,0xBB,0xBF])
			,CITY_CSV.join("\n")
		]
		,{"type":"text/plain"})
	);
	var downA=document.createElement("A");
	downA.innerHTML=" Download and query the files of a good city ";
	downA.href=url;
	downA.download="ok_data.csv";
	document.body.appendChild(downA);
	downA.click();
}
OK, We're done with the data :


Data issues

*
id The number is basically the same as that of the National Bureau of statistics , Easy to update later .

*
The ids currently contain no duplicates (after optimization). In the earlier collection, however, simply shortening the Bureau of Statistics codes produced duplicates (a loss of precision).

*
The pinyin prefix is the first two letters of the first syllable plus the initial letters of the next two syllables. The intent is to sort names that share the same first character together as far as possible. Ordering 1 (this scheme): Heilongjiang helj, Hubei hub, Hunan hun; ordering 2 (initials only):
Hubei hb, Heilongjiang hlj, Hunan hn. Ordering 1 wins.

*
Because district names simply have their "city" or "district" suffix stripped, quite a few pairs of names end up exactly identical. The suffix has to be added back manually for those, otherwise it causes minor problems.

*
The final data has been uploaded to CSDN, including all the code and this document: http://download.csdn.net/download/xiangyuecn/10226964
The latest data can be downloaded from GitHub:
<https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases>