Updates to this article:
19-03-22: Newly collected the 2018 city data.
18-11-28: Collected the 2017 city data.

Data download on GitHub: https://github.com/xiangyuecn/AreaCity-JsSpider-StatsGov/releases
For related updates, please refer to the other articles I have published; the content below will no longer be updated.

18-01-28: taking the 6:30 a.m. train home from Sanya. Tickets were hard to get... so excited~
Statement: the data, third-party interfaces and URLs involved in this article are for learning purposes only; please do not use them for anything else~

These days I have been building a local test environment and found that the province/city/district data table was empty. The old data was collected in 2013 and covered four levels (province, city, district and county), about 48,000 records in total, and it is getting old: some new city names that came up during use could not be found in the database, and the county-level data was never used. So I decided it was better to collect a fresh set.

The newly collected province, city and district data contains 3,589 records. County-level data was not collected this time; it can be added later when it is needed.

Data source

The National Bureau of Statistics statistical standard "Division codes and urban-rural division codes for statistics, 2016 (as of July 31, 2016)", released on 2017-05-16. It is the current version.

Data acquisition

As for data collection: because of work needs, I have written a few small data-collection tools before. I am familiar with HTML and JS, and long ago the IE browser allowed a local HTML file to make arbitrary cross-domain ajax requests and to read and write Excel files, so I would simply write an HTML file as a collection tool for others to use, for things like batch queries of personnel information and test results. That is why JS is my main tool for data collection.

1. Grabbing the raw data

Open the web page http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
The province data is on that page. From there you can enter the city-level page, then the district-level page, and you can also go on to the county-level page. The URL structure throughout is very simple, and the data format is easy to extract.

After opening the web page, open the browser console and execute the code below. This code only collects the province, city and district levels; the county level has been cut out (the old code from 2013 did include the county level). The code was written a long time ago and the style is a little ugly, but it works fine. This collection is "single-threaded"; because there is not much data, it is not slow:
/*
Collect province / city / district names from
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html

Usage: open the page above, paste this script into the browser console and
run it. It defines window.RunLoad(shen, si, xian) and then calls RunLoad().
When the crawl finishes, a file "data.txt" containing `var CITY_LIST=[...]`
is offered for download.

The crawl is sequential: one province at a time and, inside it, one city at
a time. Each node is attempted at most 3 times before the crawl aborts and
the current progress is printed.

NOTE(review): the log messages and the link text below are reproduced exactly
as found; they look machine-translated - confirm against the original
repository if the exact wording matters.
*/
(function(){
if(!window.URL){
	throw new Error(" Browser version too low");
};

// GET `url`; call True(responseText) on HTTP 200, False() on any other
// final state (including the 1 second timeout).
function ajax(url,True,False){
	var xhr=new XMLHttpRequest();
	xhr.timeout=1000;
	xhr.open("GET",url);
	xhr.onreadystatechange=function(){
		if(xhr.readyState==4){
			if(xhr.status==200){
				True(xhr.responseText);
			}else{
				False();
			}
		}
	}
	xhr.send();
}

// console.log wrapper.
function msg(){
	console.log.apply(console, arguments);
}

// One node of the tree (province, city or district).
function cityClass(name,url,code){
	this.name=name;
	this.url=url;
	this.code=code;
	this.child=[];
	this.tryCount=0;
}
cityClass.prototype={
	// Plain {name, code, child} copy of this node and its descendants
	// (drops url and tryCount) for JSON output.
	getValue:function(){
		var obj={name:this.name,code:this.code,child:[]};
		for(var i=0;i<this.child.length;i++){
			obj.child.push(this.child[i].getValue());
		}
		return obj;
	}
}

// Load the province list into DATA, then call True().
function load_all(True){
	// Base address of the 2016 index; relative links on the pages are
	// resolved against it.
	var path="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016";
	ajax(path+"/index.html",function(text){
		var reg=/href='(.+?)'>(.+?)<br/ig,match;
		var idx;
		if((idx=text.indexOf("<tr class='provincetr'>"))+1){
			// Start matching at the province table, skipping earlier links.
			reg.lastIndex=idx;
			while(match=reg.exec(text)){
				var url=match[1];
				if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
					url=path+"/"+url;
				}
				var name=match[2];
				DATA.push(new cityClass(name,url,0));
			}
			True();
		}else{
			msg(" No province data found");
		}
	},function(){
		msg(" Error reading province list"," Program termination");
	});
}

// Load the cities of province DATA[JD.shen], crawl them via load_si, then
// move on to the next province. True() when every province is done,
// False() when a node failed more than 3 times.
function load_shen(True, False){
	var city=DATA[JD.shen];
	city.tryCount++;
	if(city.tryCount>3){
		msg(" Read Province["+city.name+"] Exceed3 second");
		False();
		return;
	};

	function get(){
		msg(" Read Province["+city.name+"]", getJD());
		save();

		city.child[JD.si].tryCount=0;
		load_si(function(){
			JD.shen++;
			if(JD.shen>=DATA.length){
				JD.shen=0;
				True();
				return;
			};

			DATA[JD.shen].tryCount=0;
			load_shen(True,False);
		},function(){
			False();
		});
	}

	if(city.child.length){
		// Children already present: skip the request.
		get();
	}else{
		ajax(city.url,function(text){
			var reg=/<tr class='citytr'>.+?href='(.+?)'>(.+?)<.+?'>(.+?)</ig;
			var match;
			while(match=reg.exec(text)){
				var url=match[1];
				// Resolve relative links against the province page's directory.
				if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
					url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;
				}
				var code=match[2];
				var name=match[3];

				city.child.push(new cityClass(name,url,code));
			}
			JD.si=0;
			get();
		},function(){
			// Retry; tryCount at the top of load_shen bounds the attempts.
			load_shen(True,False);
		});
	};
}

// Load the districts of city DATA[JD.shen].child[JD.si], then move on to
// the next city of the same province. True() when the province's cities
// are all done, False() when a city failed more than 3 times.
function load_si(True,False){
	var shen=DATA[JD.shen];
	var city=shen.child[JD.si];
	city.tryCount++;
	if(city.tryCount>3){
		msg(" Reading City["+city.name+"] Exceed3 second");
		False();
		return;
	};

	function get(){
		msg("___ Reading City["+city.name+"]", getJD());

		city.child[JD.xian].tryCount=0;

		JD.si++;
		if(JD.si>=shen.child.length){
			JD.si=0;
			True();
			return;
		};

		shen.child[JD.si].tryCount=0;
		load_si(True,False);
	}

	if(city.child.length){
		get();
	}else{
		ajax(city.url,function(text){
			var reg=/class='(?:countytr|towntr)'.+?<\/tr>/ig;
			var match;
			while(match=reg.exec(text)){
				// A row is either linked (<td><a href=...>) or plain text
				// (<td>...</td>); the two alternatives fill groups 1-3 or 4-5.
				var reg2=/class='(?:countytr|towntr)'.+?(?:<td><a href='(.+?)'>(.+?)<.+?'>(.+?)<|<td>(.+?)<.+?<td>(.+?)<)/ig;
				var match2;
				if(match2=reg2.exec(match[0])){
					var url=match2[1]||"";
					if(url.indexOf("//")==-1 && url.indexOf("/")!=0){
						url=city.url.substring(0,city.url.lastIndexOf("/"))+"/"+url;
					}
					var code=match2[2]||match2[4];
					var name=match2[3]||match2[5];

					city.child.push(new cityClass(name,url,code));
				}else{
					msg(" Unknown city mode:");
					msg(city.url);
					msg(match[0]);
					throw new Error("end");
				}
			}
			JD.xian=0;
			get();
		},function(){
			// Retry; tryCount at the top of load_si bounds the attempts.
			load_si(True,False);
		});
	};
}

// Human-readable progress string built from the JD cursor.
function getJD(){
	var str=" province:"+(JD.shen+1)+"/"+DATA.length;
	var shen=DATA[JD.shen];
	if(shen){
		str+=" city:"+(JD.si+1)+"/"+shen.child.length;
		var si=shen.child[JD.si];
		if(si){
			str+=" county:"+(JD.xian+1)+"/"+si.child.length;
		}else{
			str+=" county:"+JD.xian;
		}
	}else{
		str+=" city:"+JD.si+" county:"+JD.xian;
	}
	return str;
}

// Persistence hook; intentionally empty in this version, so nothing is
// written to localStorage by this script.
function save(){
}

var DATA=[];  // province nodes (cityClass)
var JD;       // progress cursor: {shen, si, xian} indexes

// Entry point. shen/si/xian are optional start indexes (default 0).
window.RunLoad=function(shen,si,xian){
	RunLoad.T1=Date.now();

	JD={
		shen:shen||0
		,si:si||0
		,xian:xian||0
	}

	function get(){
		DATA[JD.shen].tryCount=0;
		load_shen(function(){
			console.log(" complete:"+(Date.now()-RunLoad.T1)/1000+" second");
			save();

			var data=[];
			for(var i=0;i<DATA.length;i++){
				data.push(DATA[i].getValue());
			}
			// UTF-8 BOM + "var CITY_LIST=<json>" so the file can be pasted
			// straight into a console in the next step.
			var url=URL.createObjectURL(
				new Blob([
					new Uint8Array([0xEF,0xBB,0xBF])
					,"var CITY_LIST="
					,JSON.stringify(data,null,"\t")
				]
				,{"type":"text/plain"})
			);
			var downA=document.createElement("A");
			downA.innerHTML=" Download and query the files of a good city";
			downA.href=url;
			downA.download="data.txt";
			document.body.appendChild(downA);
			downA.click();

			msg("-- complete--");
		},function(){
			save();
			msg(" Current progress:", getJD());
		});
	}

	// NOTE(review): save() above never writes "load_data", so this resume
	// branch only runs if the key was stored by other means; objects restored
	// by JSON.parse are plain objects without getValue() - confirm before
	// relying on it.
	var data=localStorage["load_data"];
	if(data){
		DATA=JSON.parse(data);
		get();
	}else{
		load_all(get);
	}
}
})();//@ sourceURL=console.js

// Execute code now
RunLoad()
Screenshot of the collection run:

2. Processing the data and adding pinyin

Processing the data is fairly easy: for example, formatting the codes, formatting the names, and so on.

Pinyin annotation: we need to find an interface that converts Chinese characters into pinyin, and there is only one requirement: Chongqing must be converted to "chong qing". Converting it to "zhong qing" would be too low-end. This single condition rules out 80% of the conversion websites found through Baidu.

Open the conversion page that was found, http://www.qqxiuzi.cn/zh/pinyin/, in the browser; as of now it can still be called normally. Because the data is requested via ajax, running the code on that page avoids any cross-domain problem. View the source code of the web page and write down the token value: conversion requests to this website must carry this token. Note: refresh the page to obtain the token again:

Because there is more data in the pinyin step, "4-thread" collection is used. First, open the file collected in the first step and copy its data into the browser console of the opened conversion website and execute it (equivalent to importing the data), then execute the following code:
/*
Pinyin annotation via http://www.qqxiuzi.cn/zh/pinyin/

Request format:
  POST http://www.qqxiuzi.cn/zh/pinyin/show.php
  t=<Chinese characters>&d=1&s=null&k=1&b=null&h=null&u=null&v=1&y=null&z=null&token=<token>
The token is taken from the source of the page (a plain GET of the page).

Usage: on that page, first paste the contents of data.txt into the console
(defines CITY_LIST), then run this script. It flattens CITY_LIST into
CITY_LIST2, queries the pinyin of every name with 4 concurrent request
chains, stores it in each entry's `P` property and finally offers
"data-pinyin.txt" (`var CITY_LIST_PINYIN=[...]`) for download.

Requires the page to provide jQuery as `$`.

NOTE(review): the names and suffix patterns inside fix(), and the log / link
texts, are reproduced exactly as found. They look machine-translated from
Chinese and will not match real Chinese names in this form - restore them
from the original repository before running.
*/
window.PageToken=window.PageToken||"";

// Trim leading/trailing whitespace.
var FixTrim=function(name){
	return name.replace(/^\s+|\s+$/g,"");
};

// Flat result list; filled by QueryPinYin.
var CITY_LIST2;

// Flatten CITY_LIST, query the pinyin for every entry, then call end().
var QueryPinYin=function(end){
	if(!window.PageToken){
		console.error("Need PageToken");
		return;
	};

	var ids=[];

	// Keep the original code in o.orgCode and shorten o.code:
	// deep 1 -> first 4 characters; deep 2 -> strip trailing "000000"/"000".
	var fixCode=function(o){
		if(o.deep==0){
			o.orgCode="0";
		}else{
			o.orgCode=o.code;
			if(o.deep==1){
				o.code=o.code.substr(0,4);
			}else{
				// A few areas, many areas3 position
				o.code=o.code.replace(/(000000|000)$/g,"");
			};
		};
		return o;
	};

	// Build the flat output record for node o (parent p) with a shortened
	// display name; the full name is kept in ext_name.
	var fix=function(o,p){
		var name=o.name;
		if(o.deep==0){
			name=name.replace(/( city| province|( Uygur| Zhuang Nationality| Hui nationality)? Autonomous Region)$/ig,"");
		}else if(o.deep==1){
			if(name==" Municipal District"){
				// Use the parent's (already shortened) name.
				name=p.o2.name;
			}else if(/ administrative division$/ig.test(name)){
				name=" Municipality directly under the Central Government";
			}else if(name.length>2){
				name=name.replace(/ city$/ig,"");
			};
		}else{
			// A direct exclusion will have the same name
			if(name.length>2 && name!=" Municipal District" && !/( autonomy.| region| mining area)$/.test(name)){
				name=name.replace(/( city| area| county| town| Management Committee| Sub district office)$/ig,"");
			};
		};

		var o2={
			name:name
			,ext_name:o.name
			,id:+o.code||0
			,ext_id:+o.orgCode
			,pid:p&&+p.code||0
			,deep:o.deep
		};
		o.o2=o2;
		return o2;
	};

	for(var i=0;i<CITY_LIST.length;i++){
		var shen=CITY_LIST[i];
		shen.deep=0;
		for(var i2=0;i2<shen.child.length;i2++){
			var si=shen.child[i2];
			if(!shen.code){
				// Provinces carry no code of their own; derive it from the
				// first two characters of the first city's code.
				shen.code=si.code.substr(0,2);
				ids.push(fix(fixCode(shen)));
			};

			si.deep=1;
			ids.push(fix(fixCode(si),shen));

			for(var i3=0;i3<si.child.length;i3++){
				var qu=si.child[i3];
				qu.deep=2;
				ids.push(fix(fixCode(qu),si));
			};
		};
	};
	CITY_LIST2=ids;
	//console.log(JSON.stringify(ids,null,"\t"))
	//return;

	var idx=-1;
	// One request chain: take the next entry, query it, repeat. `stack`
	// counts consecutive synchronous skips so the recursion is broken with
	// setTimeout every 50 entries.
	var run=function(stack){
		stack=+stack||0;

		idx++;
		if(idx>=ids.length){
			thread--;
			if(thread==0){
				end();
			};
			return;
		};

		var idx_=idx;
		var id=ids[idx];
		if(id.P){
			// Already has pinyin: skip without a request.
			stack++;
			if(stack%50==0){
				setTimeout(function(){run()});
			}else{
				run(stack);
			};
			return;
		};
		var name=id.name;

		var tryCount=0;
		var tryLoad=function(){
			$.ajax({
				url:"http://www.qqxiuzi.cn/zh/pinyin/show.php"
				// Same fields as the request documented in the header; passed
				// as an object so jQuery URL-encodes the values.
				,data:{
					t:name
					,d:1
					,s:"null"
					,k:1
					,b:"null"
					,h:"null"
					,u:"null"
					,v:1
					,y:"null"
					,z:"null"
					,token:PageToken
				}
				,type:"POST"
				,dataType:"text"
				,timeout:1000
				,error:function(e){
					if(tryCount>3){
						// Give up on this entry (P stays unset) and continue.
						console.error("--QueryPinYin error--"+e);
						run();
						return;
					};
					tryCount++;
					tryLoad();
				}
				,success:function(txt){
					// Strip tags and collapse whitespace.
					txt=FixTrim(txt.replace(/<.+?>/g,"").replace(/\s+/g," "));
					id.P=txt;
					console.log("--"+idx_+"-QueryPinYin "+name+":"+txt+" --");

					run();
				}
			});
		};
		tryLoad();
	};

	// Number of chains still running; end() fires when the last one stops.
	var thread=4;
	run();
	run();
	run();
	run();
};

// Publish the result as window.CITY_LIST_PINYIN and offer it for download.
var ViewDown=function(){
	console.log(" complete:"+(Date.now()-RunPinYin.T1)/1000+" second");
	window.CITY_LIST_PINYIN=CITY_LIST2;

	// UTF-8 BOM + "var CITY_LIST_PINYIN=<json>".
	var url=URL.createObjectURL(
		new Blob([
			new Uint8Array([0xEF,0xBB,0xBF])
			,"var CITY_LIST_PINYIN="
			,JSON.stringify(CITY_LIST2,null,"\t")
		]
		,{"type":"text/plain"})
	);
	var downA=document.createElement("A");
	downA.innerHTML=" Download and query the files of a good city";
	downA.href=url;
	downA.download="data-pinyin.txt";
	document.body.appendChild(downA);
	downA.click();
};

// Start the timer and run the whole pipeline.
var RunPinYin=function(){
	RunPinYin.T1=Date.now();
	QueryPinYin(ViewDown);
};

// Execute code now
if(window.CITY_LIST){
	if(!PageToken){
		PageToken=prompt("Token");
	};
	RunPinYin();
}else{
	console.error("data.txt No input");
};
You will be prompted to enter the token; paste in the token you found earlier, and the script starts working:

It is pretty fast: in a little over 2 minutes the conversion is complete.

3. Formatting into CSV

We now have all the data, so export it to a more conventional format; CSV is best. This export is relatively simple: open the file saved in the second step, copy its data into the console of any web page, then enter the following code:
/*
Format the pinyin-annotated list and output it as CSV ("ok_data.csv").

Load the data first: paste the contents of data-pinyin.txt into the console
so that CITY_LIST_PINYIN is defined, then run this script.

Importing into a database: file format Unicode, text as character stream.
After importing into the area_city table:
  - check for duplicate ids and correct them;
  - add the Hong Kong/Macao/Taiwan and overseas entries (two extra provinces);
  - check for one-character and duplicate names and fix them:
      select * from area_city where len(name)=1
      select pid,name,count(*) from area_city group by pid,name having COUNT(*)>1
*/

// Trim leading/trailing whitespace.
var FixTrim=function(name){
	return name.replace(/^\s+|\s+$/g,"");
};

// Quote a value as a CSV field: trim it, double embedded quotes, wrap in "".
function CSVName(name){
	return '"'+FixTrim(name).replace(/"/g,'""')+'"';
};

// Sort prefix from a space-separated pinyin string: first two letters of
// the first syllable plus the first letter of the 2nd and 3rd syllables,
// e.g. "hei long jiang" -> "helj".
function PinyinPrefix(pinyin){
	var ps=pinyin.split(" ");
	var pf="";
	for(var j=0;j<ps.length&&j<3;j++){
		pf+=ps[j].substr(0,j==0?2:1);
	};
	return pf;
};

// One CSV line for record o, in the column order of the header row.
// A record without pinyin (o.P unset, e.g. after a failed lookup) gets
// empty pinyin columns instead of throwing.
function CSVRow(o){
	var pinyin=FixTrim(o.P||"").toLowerCase();
	return o.id+","+o.pid+","+o.deep+","+CSVName(o.name)
		+","+CSVName(PinyinPrefix(pinyin))+","+CSVName(pinyin)
		+","+CSVName(o.ext_id+"")+","+CSVName(o.ext_name);
};

var CITY_CSV=["id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name"];

if(typeof CITY_LIST_PINYIN=="undefined"){
	console.error("data-pinyin.txt No input");
}else{
	for(var i=0;i<CITY_LIST_PINYIN.length;i++){
		CITY_CSV.push(CSVRow(CITY_LIST_PINYIN[i]));
	};

	// UTF-8 BOM + CSV text.
	var url=URL.createObjectURL(
		new Blob([
			new Uint8Array([0xEF,0xBB,0xBF])
			,CITY_CSV.join("\n")
		]
		,{"type":"text/plain"})
	);
	var downA=document.createElement("A");
	downA.innerHTML=" Download and query the files of a good city";
	downA.href=url;
	downA.download="ok_data.csv";
	document.body.appendChild(downA);
	// Trigger the download, as the two earlier scripts do.
	downA.click();
};
OK, the data is now complete:

Data issues

The id numbers are basically the same as the codes of the National Bureau of Statistics, which makes later updates easy.

The ids currently contain no duplicates (after optimization). In the earlier collection, however, the Bureau of Statistics codes were simply shortened, which produced duplicates (a loss of precision).

The pinyin prefix is the first two letters of the first character's pinyin followed by the initial letters of the next two characters. The intention is to sort names that share the same first character together as much as possible. Sort 1: Heilongjiang "helj", Hubei "hub", Hunan "hun"; sort 2: Hubei "hb", Heilongjiang "hlj", Hunan "hn". Sort 1 wins.

Because the "city" and "district" suffixes are stripped directly from the district names, quite a few pairs of names become exactly the same. For those, the suffix has to be added back manually; otherwise it will cause small problems.

The final data has been uploaded to CSDN, including all the code and this document: http://download.csdn.net/download/xiangyuecn/10226964
The latest data can be downloaded from GitHub.