not hadoop but map/reduce
map/reduce 是很通用的,并非只有在 Hadoop 上才能使用,不要被限制。处理已排序的列表,在内存和计算上的消耗都是很低的。Mapper
package mapreduce;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.TreeMap;public abstract class Mapper<K0, V0, K, V> {private final Map<K, List<V>> map = new TreeMap<K, List<V>>();public Map<K, List<V>> getResult() {return map;}public abstract void map(K0 key, V0 value, Context context)throws RuntimeException;public class Context {public void write(K k, V v) {List<V> list = map.get(k);if (list == null) {list = new ArrayList<V>();}list.add(v);map.put(k, list);}}} Reducer
package mapreduce;import java.util.Map;import java.util.TreeMap;public abstract class Reducer<K, V, K1, V1> {private final TreeMap<K1, V1> map = new TreeMap<K1, V1>();public Map<K1, V1> getResult() {return map;}public abstract void reduce(K k, Iterable<V> list, Context context)throws RuntimeException;public class Context {public void write(K1 k, V1 v) {map.put(k, v);}}} Job
// 从数组读入,输出到Log,仅供原理演示
// 满屏的小黄叹号,告诉我们说java的泛型太tm难用了
package mapreduce;import java.util.Iterator;import java.util.List;import java.util.Map;import org.slf4j.Logger;import org.slf4j.LoggerFactory;public class Job {private static final Logger LOG = LoggerFactory.getLogger(Job.class);private Class<? extends Mapper> map;private Class<? extends Reducer> reduce;public void setMap(Class<? extends Mapper> map) {this.map = map;}public void setReduce(Class<? extends Reducer> reduce) {this.reduce = reduce;}public int run(String[] input) throws RuntimeException {int len = input.length;try {Mapper m = map.newInstance();for (int i = 0; i < len; i++) {m.map(0L + i, input, m.new Context());}Map<Object, List<Object>> mapResult = m.getResult();Reducer r = reduce.newInstance();Iterator<Object> it = mapResult.keySet().iterator();while (it.hasNext()) {Object k = it.next();r.reduce(k, mapResult.get(k), r.new Context());}Map<Object, Object> reduceResult = r.getResult();it = reduceResult.keySet().iterator();while (it.hasNext()) {Object k = it.next();LOG.info("{}\t{}", k.toString(), reduceResult.get(k));}} catch (InstantiationException e) {LOG.error("", e);throw new RuntimeException(e);} catch (IllegalAccessException e) {LOG.error("", e);throw new RuntimeException(e);} catch (Exception e) {LOG.error("", e);throw new RuntimeException(e);}return 1;}} 实例 wordcount
package mr.maillog;import java.io.IOException;import mapreduce.Job;import mapreduce.Mapper;import mapreduce.Reducer;import org.slf4j.Logger;import org.slf4j.LoggerFactory;public class WordCount {private static final Logger LOG = LoggerFactory.getLogger(WordCount.class);public static final class M extends Mapper<Long, String, String, Long> {@Overridepublic void map(Long key, String value, Context context)throws RuntimeException {String tmp = value.toLowerCase();String[] line = tmp.split("[\\s]+");for (String word : line) {context.write(word, 1L);}}}public static final class R extends Reducer<String, Long, String, Long> {@Overridepublic void reduce(String k, Iterable<Long> list, Context context)throws RuntimeException {Long count = 0L;for (Long item : list) {count += item;}context.write(k, count);}}/** * @param args * @throws IOException * @throws InterruptedException */public static void main(String[] args) throws IOException,InterruptedException {LOG.info("Hi");String[] data = {"What Is Apache Hadoop?","The Apache™ Hadoop™ project develops open-source software for reliable, scalable, distributed computing.","The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using a simple programming model. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage. 
Rather than rely on hardware to deliver high-avaiability, the library itself is designed to detect and handle failures at the application layer, so delivering a highly-availabile service on top of a cluster of computers, each of which may be prone to failures.","The project includes these subprojects:","Hadoop Common: The common utilities that support the other Hadoop subprojects.","Hadoop Distributed File System (HDFS™): A distributed file system that provides high-throughput access to application data.","Hadoop MapReduce: A software framework for distributed processing of large data sets on compute clusters.","Other Hadoop-related projects at Apache include:","Avro™: A data serialization system.","Cassandra™: A scalable multi-master database with no single points of failure.","Chukwa™: A data collection system for managing large distributed systems.","HBase™: A scalable, distributed database that supports structured data storage for large tables.","Hive™: A data warehouse infrastructure that provides data summarization and ad hoc querying.","Mahout™: A Scalable machine learning and data mining library.","Pig™: A high-level data-flow language and execution framework for parallel computation.","ZooKeeper™: A high-performance coordination service for distributed applications." };Job job = new Job();job.setMap(M.class);job.setReduce(R.class);job.run(data);}} 然后,输出了
12/07/04 16:05:31 INFO maillog.WordCount: Hi12/07/04 16:05:31 INFO mapreduce.Job: (hdfs™):112/07/04 16:05:31 INFO mapreduce.Job: a1412/07/04 16:05:31 INFO mapreduce.Job: access112/07/04 16:05:31 INFO mapreduce.Job: across112/07/04 16:05:31 INFO mapreduce.Job: ad112/07/04 16:05:31 INFO mapreduce.Job: allows112/07/04 16:05:31 INFO mapreduce.Job: and512/07/04 16:05:31 INFO mapreduce.Job: apache312/07/04 16:05:31 INFO mapreduce.Job: apache™112/07/04 16:05:31 INFO mapreduce.Job: application212/07/04 16:05:31 INFO mapreduce.Job: applications.112/07/04 16:05:31 INFO mapreduce.Job: at212/07/04 16:05:31 INFO mapreduce.Job: avro™:112/07/04 16:05:31 INFO mapreduce.Job: be112/07/04 16:05:31 INFO mapreduce.Job: cassandra™:112/07/04 16:05:31 INFO mapreduce.Job: chukwa™:112/07/04 16:05:31 INFO mapreduce.Job: cluster112/07/04 16:05:31 INFO mapreduce.Job: clusters112/07/04 16:05:31 INFO mapreduce.Job: clusters.112/07/04 16:05:31 INFO mapreduce.Job: collection112/07/04 16:05:31 INFO mapreduce.Job: common112/07/04 16:05:31 INFO mapreduce.Job: common:112/07/04 16:05:31 INFO mapreduce.Job: computation112/07/04 16:05:31 INFO mapreduce.Job: computation.112/07/04 16:05:31 INFO mapreduce.Job: compute112/07/04 16:05:31 INFO mapreduce.Job: computers112/07/04 16:05:31 INFO mapreduce.Job: computers,112/07/04 16:05:31 INFO mapreduce.Job: computing.112/07/04 16:05:31 INFO mapreduce.Job: coordination112/07/04 16:05:31 INFO mapreduce.Job: data812/07/04 16:05:31 INFO mapreduce.Job: data-flow112/07/04 16:05:31 INFO mapreduce.Job: data.112/07/04 16:05:31 INFO mapreduce.Job: database212/07/04 16:05:31 INFO mapreduce.Job: deliver112/07/04 16:05:31 INFO mapreduce.Job: delivering112/07/04 16:05:31 INFO mapreduce.Job: designed212/07/04 16:05:31 INFO mapreduce.Job: detect112/07/04 16:05:31 INFO mapreduce.Job: develops112/07/04 16:05:31 INFO mapreduce.Job: distributed812/07/04 16:05:31 INFO mapreduce.Job: each212/07/04 16:05:31 INFO mapreduce.Job: execution112/07/04 16:05:31 INFO mapreduce.Job: 
failure.112/07/04 16:05:31 INFO mapreduce.Job: failures112/07/04 16:05:31 INFO mapreduce.Job: failures.112/07/04 16:05:31 INFO mapreduce.Job: file212/07/04 16:05:31 INFO mapreduce.Job: for712/07/04 16:05:31 INFO mapreduce.Job: framework312/07/04 16:05:31 INFO mapreduce.Job: from112/07/04 16:05:31 INFO mapreduce.Job: hadoop512/07/04 16:05:31 INFO mapreduce.Job: hadoop-related112/07/04 16:05:31 INFO mapreduce.Job: hadoop?112/07/04 16:05:31 INFO mapreduce.Job: hadoop™112/07/04 16:05:31 INFO mapreduce.Job: handle112/07/04 16:05:31 INFO mapreduce.Job: hardware112/07/04 16:05:31 INFO mapreduce.Job: hbase™:112/07/04 16:05:31 INFO mapreduce.Job: high-avaiability,112/07/04 16:05:31 INFO mapreduce.Job: high-level112/07/04 16:05:31 INFO mapreduce.Job: high-performance112/07/04 16:05:31 INFO mapreduce.Job: high-throughput112/07/04 16:05:31 INFO mapreduce.Job: highly-availabile112/07/04 16:05:31 INFO mapreduce.Job: hive™:112/07/04 16:05:31 INFO mapreduce.Job: hoc112/07/04 16:05:31 INFO mapreduce.Job: include:112/07/04 16:05:31 INFO mapreduce.Job: includes112/07/04 16:05:31 INFO mapreduce.Job: infrastructure112/07/04 16:05:31 INFO mapreduce.Job: is412/07/04 16:05:31 INFO mapreduce.Job: it112/07/04 16:05:31 INFO mapreduce.Job: itself112/07/04 16:05:31 INFO mapreduce.Job: language112/07/04 16:05:31 INFO mapreduce.Job: large412/07/04 16:05:31 INFO mapreduce.Job: layer,112/07/04 16:05:31 INFO mapreduce.Job: learning112/07/04 16:05:31 INFO mapreduce.Job: library212/07/04 16:05:31 INFO mapreduce.Job: library.112/07/04 16:05:31 INFO mapreduce.Job: local112/07/04 16:05:31 INFO mapreduce.Job: machine112/07/04 16:05:31 INFO mapreduce.Job: machines,112/07/04 16:05:31 INFO mapreduce.Job: mahout™:112/07/04 16:05:31 INFO mapreduce.Job: managing112/07/04 16:05:31 INFO mapreduce.Job: mapreduce:112/07/04 16:05:31 INFO mapreduce.Job: may112/07/04 16:05:31 INFO mapreduce.Job: mining112/07/04 16:05:31 INFO mapreduce.Job: model.112/07/04 16:05:31 INFO mapreduce.Job: multi-master112/07/04 16:05:31 
INFO mapreduce.Job: no112/07/04 16:05:31 INFO mapreduce.Job: of812/07/04 16:05:31 INFO mapreduce.Job: offering112/07/04 16:05:31 INFO mapreduce.Job: on312/07/04 16:05:31 INFO mapreduce.Job: open-source112/07/04 16:05:31 INFO mapreduce.Job: other212/07/04 16:05:31 INFO mapreduce.Job: parallel112/07/04 16:05:31 INFO mapreduce.Job: pig™:112/07/04 16:05:31 INFO mapreduce.Job: points112/07/04 16:05:31 INFO mapreduce.Job: processing212/07/04 16:05:31 INFO mapreduce.Job: programming112/07/04 16:05:31 INFO mapreduce.Job: project212/07/04 16:05:31 INFO mapreduce.Job: projects112/07/04 16:05:31 INFO mapreduce.Job: prone112/07/04 16:05:31 INFO mapreduce.Job: provides212/07/04 16:05:31 INFO mapreduce.Job: querying.112/07/04 16:05:31 INFO mapreduce.Job: rather112/07/04 16:05:31 INFO mapreduce.Job: reliable,112/07/04 16:05:31 INFO mapreduce.Job: rely112/07/04 16:05:31 INFO mapreduce.Job: scalable212/07/04 16:05:31 INFO mapreduce.Job: scalable,212/07/04 16:05:31 INFO mapreduce.Job: scale112/07/04 16:05:31 INFO mapreduce.Job: serialization112/07/04 16:05:31 INFO mapreduce.Job: servers112/07/04 16:05:31 INFO mapreduce.Job: service212/07/04 16:05:31 INFO mapreduce.Job: sets212/07/04 16:05:31 INFO mapreduce.Job: simple112/07/04 16:05:31 INFO mapreduce.Job: single212/07/04 16:05:31 INFO mapreduce.Job: so112/07/04 16:05:31 INFO mapreduce.Job: software312/07/04 16:05:31 INFO mapreduce.Job: storage112/07/04 16:05:31 INFO mapreduce.Job: storage.112/07/04 16:05:31 INFO mapreduce.Job: structured112/07/04 16:05:31 INFO mapreduce.Job: subprojects.112/07/04 16:05:31 INFO mapreduce.Job: subprojects:112/07/04 16:05:31 INFO mapreduce.Job: summarization112/07/04 16:05:31 INFO mapreduce.Job: support112/07/04 16:05:31 INFO mapreduce.Job: supports112/07/04 16:05:31 INFO mapreduce.Job: system312/07/04 16:05:31 INFO mapreduce.Job: system.112/07/04 16:05:31 INFO mapreduce.Job: systems.112/07/04 16:05:31 INFO mapreduce.Job: tables.112/07/04 16:05:31 INFO mapreduce.Job: than112/07/04 16:05:31 INFO 
mapreduce.Job: that512/07/04 16:05:31 INFO mapreduce.Job: the812/07/04 16:05:31 INFO mapreduce.Job: these112/07/04 16:05:31 INFO mapreduce.Job: thousands112/07/04 16:05:31 INFO mapreduce.Job: to612/07/04 16:05:31 INFO mapreduce.Job: top112/07/04 16:05:31 INFO mapreduce.Job: up112/07/04 16:05:31 INFO mapreduce.Job: using112/07/04 16:05:31 INFO mapreduce.Job: utilities112/07/04 16:05:31 INFO mapreduce.Job: warehouse112/07/04 16:05:31 INFO mapreduce.Job: what112/07/04 16:05:31 INFO mapreduce.Job: which112/07/04 16:05:31 INFO mapreduce.Job: with112/07/04 16:05:31 INFO mapreduce.Job: zookeeper™:1
p.s.
maven dependency
<dependency><groupId>org.slf4j</groupId><artifactId>slf4j-api</artifactId><version>1.6.5</version></dependency><dependency><groupId>org.slf4j</groupId><artifactId>slf4j-log4j12</artifactId><version>1.6.5</version></dependency>
页:
[1]