

Hadoop in Action: Browser Statistics

纯真年代 2018-09-17 23:41:05

[Figure: sample lines from the access log]

From lines like these we extract the browser information. (The logs were generated while running my blog.)

The UserAgentParser utility class (grab the source from its repository):

    1) Clone the repository with git

    2) Install it into the local Maven repository:

mvn clean install -DskipTests
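
If the build succeeds, the artifact lands in your local repository under the coordinates declared in the next step, typically:

~/.m2/repository/com/kumkee/UserAgentParser/0.0.1/UserAgentParser-0.0.1.jar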


    3) Add the dependency to pom.xml

<!-- UserAgentParser dependency -->
<dependency>
    <groupId>com.kumkee</groupId>
    <artifactId>UserAgentParser</artifactId>
    <version>0.0.1</version>
</dependency>

    4) Test a single record and check the output

@Test
public void testUserAgentParser() {
    // Imports needed: org.junit.Test, com.kumkee.userAgent.UserAgent,
    // com.kumkee.userAgent.UserAgentParser
    String source = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
    UserAgentParser userAgentParser = new UserAgentParser();
    UserAgent agent = userAgentParser.parse(source);

    String browser = agent.getBrowser();
    String engine = agent.getEngine();
    String engineVersion = agent.getEngineVersion();
    String os = agent.getOs();
    String platform = agent.getPlatform();
    String version = agent.getVersion();
    boolean mobile = agent.isMobile();
    System.err.println(browser + " , " + engine + " , " + engineVersion + " , " + os + " , " + platform + " , "
            + version + " , " + mobile);
}



    5) Standalone test (plain Java, no Hadoop)

@Test
public void testReadFile() throws Exception {
    // Also needs java.io.*, java.util.*, java.util.regex.*,
    // and StringUtils from Apache Commons Lang
    String path = "C:\\Users\\An-Il\\Desktop\\openresty\\logs\\access.log";
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(path))));
    String line;
    int count = 0;

    Map<String, Integer> browserMap = new HashMap<String, Integer>();

    UserAgentParser userAgentParser = new UserAgentParser();
    // Read the log one line at a time
    while ((line = reader.readLine()) != null) {
        if (StringUtils.isNotBlank(line)) {
            count++;
            // The user agent sits between the 5th and 6th double quote of the line
            String source = line.substring(getCharacterPosition(line, "\"", 5) + 1,
                    getCharacterPosition(line, "\"", 6));
            UserAgent agent = userAgentParser.parse(source);
            String browser = agent.getBrowser();
            String engine = agent.getEngine();
            String engineVersion = agent.getEngineVersion();
            String os = agent.getOs();
            String platform = agent.getPlatform();
            String version = agent.getVersion();
            boolean mobile = agent.isMobile();

            // Count occurrences per browser
            Integer browserNum = browserMap.get(browser);
            if (browserNum != null) {
                browserMap.put(browser, browserNum + 1);
            } else {
                browserMap.put(browser, 1);
            }

            System.err.println(browser + " , " + engine + " , " + engineVersion + " , " + os + " , " + platform
                    + " , " + version + " , " + mobile);
        }
    }
    reader.close();
    System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
    System.err.println("Total records: " + count);
    for (Map.Entry<String, Integer> entry : browserMap.entrySet()) {
        System.err.println(entry.getKey() + ":" + entry.getValue());
    }
}

/**
 * Returns the index of the index-th occurrence of the given token in a string.
 * Note: throws IllegalStateException if the token occurs fewer than index times.
 * @param value    the string to search
 * @param operator the token (a regex) to look for
 * @param index    which occurrence to locate, starting at 1
 * @return the index of that occurrence
 */
private int getCharacterPosition(String value, String operator, int index) {
    Matcher matcher = Pattern.compile(operator).matcher(value);
    int midx = 0;
    while (matcher.find()) {
        midx++;
        if (midx == index) {
            break;
        }
    }
    return matcher.start();
}
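
To make the quote-counting trick concrete: in the usual Nginx/OpenResty "combined" log format the user agent is the quoted field after the referer, so it sits between the 5th and 6th double quote of the line. A minimal sketch with a made-up log line (nthIndexOf is a hypothetical, regex-free equivalent of getCharacterPosition):

public class QuoteExtractDemo {
    public static void main(String[] args) {
        // Hypothetical line in the combined format:
        // remote_addr - remote_user [time] "request" status bytes "referer" "user_agent"
        String line = "127.0.0.1 - - [17/Sep/2018:23:41:05 +0800] "
                + "\"GET / HTTP/1.1\" 200 612 \"-\" "
                + "\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36\"";
        // Quotes 1-2 wrap the request, 3-4 the referer, 5-6 the user agent
        String source = line.substring(nthIndexOf(line, '"', 5) + 1, nthIndexOf(line, '"', 6));
        System.out.println(source); // Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36
    }

    // Index of the n-th occurrence of c in s, counting from 1
    private static int nthIndexOf(String s, char c, int n) {
        int idx = -1;
        while (n-- > 0) {
            idx = s.indexOf(c, idx + 1);
            if (idx < 0) {
                throw new IllegalArgumentException("not enough occurrences");
            }
        }
        return idx;
    }
}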


Implementing the same statistics with MapReduce

    1) The code

package cn.project;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.kumkee.userAgent.UserAgent;
import com.kumkee.userAgent.UserAgentParser;

/**
 * @ClassName: LogApp
 * @Description: counts visits per browser
 * @author: yy
 * @date: 2018-09-19 19:30:06
 */
public class LogApp {

    /**
     * Map: reads the input file line by line
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        LongWritable one = new LongWritable(1);

        private UserAgentParser userAgentParser;

        @Override
        protected void setup(Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            userAgentParser = new UserAgentParser();
        }

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Each value received is one line of data (i.e., one log entry)
            String line = value.toString();

            // The user agent sits between the 5th and 6th double quote
            String source = line.substring(getCharacterPosition(line, "\"", 5) + 1,
                    getCharacterPosition(line, "\"", 6));
            UserAgent agent = userAgentParser.parse(source);
            String browser = agent.getBrowser();

            // Emit the map result through the context
            context.write(new Text(browser), one);
        }

        @Override
        protected void cleanup(Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            userAgentParser = null;
        }

    }

    /**
     * Reduce: merges and sums the map output
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // Sum the hits for this browser
                sum += value.get();
            }
            // Write out the final count
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Driver: wires together all the information for the MapReduce job
     */
    public static void main(String[] args) throws Exception {

        // 1. Create the Configuration
        Configuration configuration = new Configuration();

        // Clean up an existing output directory, if any
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("The output path already existed and has been deleted");
        }

        // 2. Create the Job
        Job job = Job.getInstance(configuration, "LogApp");

        // 2.1 Set the jar's main class
        job.setJarByClass(LogApp.class);

        // 3. Set the input path for the job
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Map settings
        job.setMapperClass(LogApp.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce settings
        job.setReducerClass(LogApp.MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Set the output path for the job's results
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

    /**
     * Returns the index of the index-th occurrence of the given token in a string.
     * Note: throws IllegalStateException if the token occurs fewer than index times.
     * @param value    the string to search
     * @param operator the token (a regex) to look for
     * @param index    which occurrence to locate, starting at 1
     * @return the index of that occurrence
     */
    private static int getCharacterPosition(String value, String operator, int index) {
        Matcher matcher = Pattern.compile(operator).matcher(value);
        int midx = 0;
        while (matcher.find()) {
            midx++;
            if (midx == index) {
                break;
            }
        }
        return matcher.start();
    }
}
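
Since the reduce step is a plain sum (associative and commutative), the same reducer can double as a combiner to pre-aggregate counts on the map side and shrink the shuffle. This is an optional tweak, not part of the original driver; it would go in main() next to the other job settings:

// Optional: pre-aggregate per-browser counts on the map side before the shuffle.
// Safe here because summing LongWritables is associative and commutative.
job.setCombinerClass(LogApp.MyReducer.class);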

    2) Package it (the UserAgentParser dependency must be bundled into the jar, so we use the assembly plugin)

    The Hadoop jars themselves do not need to be bundled, because the server already has the Hadoop jars (see the note after the plugin configuration below)


<!-- Package with: mvn assembly:assembly -DskipTests -->
<build>
    <plugins>
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <mainClass></mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
        </plugin>
    </plugins>
</build>
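
For the Hadoop jars themselves, the usual way to keep them out of the assembled jar is the provided scope. A sketch, assuming the pom declares hadoop-client and a hadoop.version property matching your cluster:

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
    <!-- provided: available at compile time, supplied by the cluster at run time -->
    <scope>provided</scope>
</dependency>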

    3) Upload, run, and check the result

hadoop jar hadoop-train-1.0-jar-with-dependencies.jar cn.project.LogApp hdfs://192.168.31.140:8020/access40000.log hdfs://192.168.31.140:8020/browserwc
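
With the default TextOutputFormat and a single reducer, the counts end up in one tab-separated part file under the output directory; it can be inspected with:

hadoop fs -text hdfs://192.168.31.140:8020/browserwc/part-r-00000

Each line is a browser name followed by its total hit count.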



Note: compiled from material found online.
This article serves as my personal notes, shared here so readers can point out mistakes.

 
