-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtool_index.php
More file actions
196 lines (180 loc) · 8.69 KB
/
tool_index.php
File metadata and controls
196 lines (180 loc) · 8.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
<?php
include('tool_include.php');
// Print a message on its own line, re-encoded from UTF-8 to GBK.
//
// $word  UTF-8 text to print.
//
// The conversion result is printed as-is, followed by a single "\n".
function echo_m_f($word){
    $gbk_text = iconv("UTF-8", "GBK", $word);
    echo $gbk_text, "\n";
}
// Append a message to log.txt in the current working directory.
//
// $s  text to write; it is written verbatim (no newline is added).
//
// FILE_APPEND is required here: without it every call truncates log.txt and
// only the most recent message survives. LOCK_EX takes an exclusive lock for
// the duration of the write.
function logmsg($s){
    file_put_contents('log.txt', $s, FILE_APPEND | LOCK_EX);
}
// Cache a remote page on local disk and return the path of the cached copy.
//
// $Industrylink  link of the page: either a path relative to $Domain or an
//                absolute URL that contains $Domain.
// $Domain        site prefix that is prepended to the relative link to build
//                the download URL.
// $file_path     prefix (directory plus trailing separator) for cache files.
//
// Returns the local cache path. When every download attempt yields an empty
// result nothing is written, so the returned path may not exist on disk.
function makefile($Industrylink,$Domain,$file_path){
    // Strip the domain BEFORE stripping the scheme. Doing it the other way
    // round means a $Domain that starts with "http://" can never match, which
    // left absolute links with the host in the cache name and produced a
    // download URL with the domain twice ($Domain . absolute link).
    $relative_link = str_replace($Domain, '', $Industrylink);

    // Local cache name: slashes of the relative link become underscores.
    $local_html_file = $file_path.str_replace("/", "_", str_replace('http://', '', $relative_link));
    // Remote URL to download from.
    $http_html_file = $Domain.$relative_link;

    if (file_exists($local_html_file)) {
        echo_m_f('页面已经存在:'.$local_html_file);
        return $local_html_file;
    }

    // Try the download up to 10 times, stopping at the first non-empty result.
    $temp = '';
    for ($attempts_left = 10; $temp == '' && $attempts_left != 0; --$attempts_left) {
        $temp = get_html_page($http_html_file);
    }
    if ($temp == '') {
        return $local_html_file;
    }

    file_put_contents($local_html_file, $temp, LOCK_EX);
    echo_m_f('提取页面成功:'.$http_html_file);
    echo_m_f('本地页面地址:'.$local_html_file);

    // Pause 1-4 seconds after each successful download; the shorter pauses
    // cover the larger share of the 0..300 range.
    $rand_var = mt_rand(0, 300);
    if ($rand_var > 200) {
        $pause_seconds = 1;
    } else if ($rand_var > 100) {
        $pause_seconds = 2;
    } else if ($rand_var > 50) {
        $pause_seconds = 3;
    } else {
        $pause_seconds = 4;
    }
    sleep($pause_seconds);
    echo "\n";

    return $local_html_file;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Collect the pagination pages of one listing and cache each of them locally.
//
// $web_file   link of the listing's first page (passed to makefile as-is).
// $Domain     site prefix; it is stripped from every pager link before the
//             link is appended to $web_file.
// $file_path  cache prefix handed through to makefile.
//
// Returns an array of local cache paths. If the first page examined has no
// pager links, the array holds just that page's cache path.
//
// html_Filter_function, Search_array_Element and array_link are defined
// outside this file; presumably they parse the pager block, search an array
// (-1 = not found) and merge two link lists - TODO confirm.
function get_sub_file($web_file,$Domain,$file_path){
    $current_page = makefile($web_file, $Domain, $file_path);
    $page_links = array();

    // Keep following the last pager link until it is one we have already seen.
    do {
        $follow_next = false;
        $pager = html_Filter_function('mode:get_all:Category:->html->body->div#main->div.m-left channel list->div.list-pages:url:'.$current_page);
        if (!isset($pager->a_href_list[0])) {
            $page_links[0] = $current_page;
            return $page_links;
        }
        $last_index = count($pager->a_href_list) - 1;
        if (Search_array_Element($page_links, $pager->a_href_list[$last_index]) == -1) {
            $follow_next = true;
        }
        $page_links = array_link($page_links, $pager->a_href_list);
        $current_page = makefile($web_file.str_replace($Domain, '', $pager->a_href_list[$last_index]), $Domain, $file_path);
        unset($pager);
    } while ($follow_next);

    // De-duplicate, then rebuild the list through implode/explode so that it
    // is indexed 0..n-1 again.
    $page_links = array_unique($page_links);
    $page_links = explode('|', implode('|', $page_links));

    // Replace every pager link with the path of its cached copy.
    for ($i = 0; isset($page_links[$i]); ++$i) {
        $relative = str_replace($Domain, '', $page_links[$i]);
        $target = ($web_file != $relative) ? $web_file.$relative : $relative;
        $page_links[$i] = makefile($target, $Domain, $file_path);
        echo $page_links[$i];
    }
    return $page_links;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Download every article linked from the given listing pages and store a
// cleaned-up text version of each one.
//
// $local_file_array  list of listing pages. Entries whose first '/'-separated
//                    segment is not 'file' are first cached through makefile().
//
// Steps, as implemented below:
//   1. For each listing page, collect the article links (a_href_list of the
//      filter result) into one list via array_link().
//   2. For each article link: cache the page with makefile(), convert it from
//      GBK to UTF-8 into "<cache path>.utf.txt", run the filter on that file,
//      turn tag tokens into "\n" or '' and write the joined text back to the
//      same ".utf.txt" file.
//   3. Append "<.utf.txt path>|" plus a newline to file/text.txt.
//
// Returns the list of ".utf.txt" paths. Side effect: the global $text is
// overwritten with the cleaned text of each article in turn.
//
// html_Filter_function, array_link and Tag_Or_No are defined outside this
// file; presumably Tag_Or_No classifies a token as a tag kind such as
// 'tag_p' - TODO confirm.
function get_Article($local_file_array){
// $text is the script-level variable; it is reassigned for every article below.
Global $text;
// Extract articles
$Article_link_list_array = array();
$local_file_array_count = 0;
while(isset($local_file_array[$local_file_array_count])){
// Entries that do not already start with the 'file' cache directory are fetched first.
$temp_array = explode('/',$local_file_array[$local_file_array_count]);
if($temp_array[0] != 'file'){
$local_file_array[$local_file_array_count] = makefile($local_file_array[$local_file_array_count],'http://www.bozhidao.com','file/');
}
$_oop_html_Filter_class = array();
$_oop_html_Filter_class = html_Filter_function('mode:get_all:Category:->html->body->div#main->div.m-left channel list->div.list-article->ul:url:'.$local_file_array[$local_file_array_count]);
$Article_link_list_array = array_link($Article_link_list_array,$_oop_html_Filter_class->a_href_list);
unset($_oop_html_Filter_class);
++$local_file_array_count;
}// Extract the article page links
$Article_link_list_array_count = 0;
while(isset($Article_link_list_array[$Article_link_list_array_count])){
$Article_link_list_array[$Article_link_list_array_count] = makefile($Article_link_list_array[$Article_link_list_array_count],'http://www.bozhidao.com','file/');
// Re-encode the cached page from GBK to UTF-8 and continue with the ".utf.txt" copy.
$str = mb_convert_encoding(file_get_contents($Article_link_list_array[$Article_link_list_array_count]), "UTF-8", "GBK");
file_put_contents($Article_link_list_array[$Article_link_list_array_count].'.utf.txt',$str,LOCK_EX);
$Article_link_list_array[$Article_link_list_array_count] = $Article_link_list_array[$Article_link_list_array_count].'.utf.txt';
echo_m_f($Article_link_list_array[$Article_link_list_array_count]);// Show the local article page
$_oop_html_Filter_class = array();
$_oop_html_Filter_class = html_Filter_function('mode:get_all:Category:->html->body->div#main->div.m-left content->div.article:url:'.$Article_link_list_array[$Article_link_list_array_count]);
$data_array_count = 0;
while(isset($_oop_html_Filter_class->data_array[$data_array_count])){
$temp = Tag_Or_No($_oop_html_Filter_class->data_array[$data_array_count]);
switch($temp){
// Block-level tags and <br> become a line break.
case 'tag_p':
case 'tag_P':
case 'endtag_p':
case 'endtag_P':
case 'tag_div':
case 'tag_DIV':
case 'endtag_div':
case 'endtag_DIV':
case 'tag_span':
case 'endtag_span':
case 'Singletag_br':
$_oop_html_Filter_class->data_array[$data_array_count] = "\n";
break;
// Inline formatting tags, links and images are dropped.
case 'tag_font':
case 'endtag_font':
case 'tag_a':
case 'tag_A':
case 'endtag_a':
case 'endtag_A':
case 'tag_strong':
case 'endtag_strong':
case 'Singletag_img':
$_oop_html_Filter_class->data_array[$data_array_count] = '';
break;
default:
// Everything else is treated as text and gets its punctuation normalised.
// NOTE(review): as written here, several of these calls use the same string
// for search and replacement and therefore change nothing; presumably the
// search strings were HTML entities originally - confirm against upstream.
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('‘','“',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('“','“',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('”','”',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('–','-',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace(' ',' ',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('·','',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('…','',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('—','—',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('"','"',$_oop_html_Filter_class->data_array[$data_array_count]);
$_oop_html_Filter_class->data_array[$data_array_count] = str_replace('’','’',$_oop_html_Filter_class->data_array[$data_array_count]);
}
++$data_array_count;
}
//print_r($_oop_html_Filter_class->data_array);
// Join the processed tokens and overwrite the ".utf.txt" file with the result.
$text = implode('',$_oop_html_Filter_class->data_array);
file_put_contents($Article_link_list_array[$Article_link_list_array_count],$text,LOCK_EX);
// Record the article in the index file, one "<path>|" entry per line.
file_put_contents('file/text.txt',$Article_link_list_array[$Article_link_list_array_count].'|',FILE_APPEND);
file_put_contents('file/text.txt',"\n",FILE_APPEND);
unset($_oop_html_Filter_class);
++$Article_link_list_array_count;
}// Download the article pages
return $Article_link_list_array;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// ---- Script entry point -------------------------------------------------
// Crawl the site's navigation, download every listing and article page into
// the cache directory, and leave an index of the article text files in
// file/text.txt.

// Site prefix and cache directory used for every makefile()/get_sub_file()
// call below (get_Article() uses the same two values internally).
$site_domain = 'http://www.bozhidao.com';
$cache_dir = 'file/';

// The cache directory must exist before anything is written into it;
// file_put_contents() does not create missing directories.
if (!is_dir($cache_dir)) {
    mkdir($cache_dir, 0777, true);
}

// Start with an empty index file.
$text = '';
file_put_contents($cache_dir.'text.txt', $text, LOCK_EX);
$log_array = array();

// Cache the home page and read the top-level navigation links from it.
$main_file = makefile('/', $site_domain, $cache_dir);
$Industry_link_array = array();
$_oop_html_Filter_class = html_Filter_function('mode:get_all:Category:->html->body->div#subnav:url:'.$main_file);
$Industry_link_array = $_oop_html_Filter_class->a_href_list;// main list
$sub_list_array = array();

// For every navigation link: cache the page, collect its pagination pages,
// then download the articles they link to.
$Industry_link_array_count = 0;
while (isset($Industry_link_array[$Industry_link_array_count])) {
    $main_file = makefile($Industry_link_array[$Industry_link_array_count], $site_domain, $cache_dir);
    $sub_list_array = get_sub_file($Industry_link_array[$Industry_link_array_count], $site_domain, $cache_dir);// pagination pages
    get_Article($sub_list_array);// return value (article list) is not used here
    ++$Industry_link_array_count;
}

// Each index entry was written as "<path>|\n"; splitting on that separator
// and dropping the last (empty) piece removes the trailing separator.
$file_list = file_get_contents($cache_dir.'text.txt');
if ($file_list !== false) {
    $file_list_array = explode("|\n", $file_list);
    array_pop($file_list_array);
    file_put_contents($cache_dir.'text.txt', implode("|\n", $file_list_array), LOCK_EX);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
?>