package org.hackystat.utilities.email;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.internet.AddressException;
import javax.mail.internet.InternetAddress;

/**
 * Validate syntax of email addresses. Does not probe to see if mailserver exists in DNS or online.
 * See MailProber for that. See ValidateEmailFile for an example of how to use this class.
 *
 * Minor reformatting. Found in:
 * http://www.velocityreviews.com/forums/
 * t128486-re-can-javamail-detect-a-nonexistant-email-address.html
 *
 * @author Roedy Green, Canadian Mind Products
 * @author Philip Johnson.
 * @version 1.0 to do: check validity of &amp; in first part of email address. Appears in practice.
 */
public final class ValidateEmailSyntax {

  /** Simple local part: letters, digits, _ and - only (no dots, no +), then a dotted domain. */
  private static final Pattern SIMPLE_NAME_PATTERN = Pattern
      .compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");

  /** Splits an address into its fields at every @ and dot. */
  private static final Pattern FIELD_SPLITTER = Pattern.compile("[@\\.]");

  /** Like the simple pattern, but also allows single interior dots in the local part; no +. */
  private static final Pattern DOTTED_NAME_PATTERN = Pattern
      .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");

  /** Local part followed by a bracketed dotted-quad, e.g. {@code xxx@[209.139.205.2]}; no +. */
  private static final Pattern IP_DOMAIN_PATTERN = Pattern
      .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]");

  /** Allows dots anywhere in the local part, but not at the start of the domain name; no +. */
  private static final Pattern LOOSE_DOTS_PATTERN = Pattern
      .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");

  /**
   * Lower-cased country code of the JVM default locale, compared against the address TLD.
   * Lower-cased with a fixed locale so the result is plain ASCII whatever the default language is.
   */
  private static final String THIS_COUNTRY = Locale.getDefault().getCountry().toLowerCase(
      Locale.ENGLISH);

  /** Top level domains given the highest score. */
  private static final Set<String> OFFICIAL_TLDS = toSet(new String[] { "aero", "biz", "coop",
      "com", "edu", "gov", "info", "mil", "museum", "name", "net", "org", "pro", });

  /** Rare, but known, top level domains. */
  private static final Set<String> RARE_TLDS = toSet(new String[] { "agent", "art", "arts",
      "asia", "auction", "aus", "bank", "cam", "chat", "church", "club", "corp", "dds", "design",
      "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed", "film", "firm", "free", "fun",
      "g", "game", "games", "gay", "ger", "globe", "gmbh", "golf", "gov", "help", "hola", "i",
      "inc", "int", "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp", "lnx", "love",
      "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic", "nom", "npo", "per", "pol",
      "prices", "radio", "rsc", "school", "scifi", "sea", "service", "sex", "shop", "sky", "soc",
      "space", "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine", "wir", "wired",
      "zine", "zoo", });

  /** Two-letter national top level domains. */
  private static final Set<String> NATIONAL_TLDS = toSet(new String[] { "ac", "ad", "ae", "af",
      "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as", "at", "au", "aw", "az", "ba", "bb",
      "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw",
      "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr",
      "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "eh",
      "er", "es", "et", "fi", "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge", "gf",
      "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk",
      "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it",
      "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
      "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "mg",
      "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx",
      "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om",
      "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa",
      "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl",
      "sm", "sn", "so", "sr", "st", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk",
      "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "um", "us", "uy",
      "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za", "zm",
      "zw", });

  /** Top level domains that mark an address as deliberately invalid. */
  private static final Set<String> BAD_TLDS = toSet(new String[] { "invalid", "nowhere",
      "noone", });

  /** Make this class noninstantiable. */
  private ValidateEmailSyntax() {
    // Do nothing.
  }

  /**
   * Returns true if the email appears to be valid, i.e. it matches one of the known address
   * patterns. Addresses that are only accepted by the lenient RFC 822 fallback are rejected.
   *
   * @param email The email address of interest. May be null, in which case false is returned.
   * @return True if it appears to be valid.
   */
  public static boolean isValid(String email) {
    return howValid(email) >= 2;
  }

  /**
   * Check how likely an email address is to be valid. The higher the number returned, the more
   * likely the address is valid. This method does not probe the internet in any way to see if the
   * corresponding mail server or domain exists.
   *
   * @param email bare computer email address, e.g. no "Roedy Green" &lt;&gt; style addresses and
   *        no local addresses such as roedy. Leading/trailing whitespace and case are ignored.
   *
   * @return 0 = address is definitely malformed: null, ends in a deliberately invalid TLD such
   *         as .invalid, or matches no pattern and is rejected by InternetAddress. <br>
   *         1 = address matches none of the patterns below (or has a field that starts or ends
   *         with _ or -), but InternetAddress accepts it in strict mode. <br>
   *         2 = address matches a name pattern, but the top level domain is unknown. <br>
   *         3 = known TLD; dots at the beginning or end of the name, or doubled in the name. <br>
   *         4 = address of form xxx@[209.139.205.2] using an IP; the TLD cannot be judged. <br>
   *         5 = known TLD; single dots inside the first part of the name. <br>
   *         6 = simple name with a rare, but known, TLD. <br>
   *         7 = simple name with any national suffix. <br>
   *         8 = simple name with the national suffix of the default locale, e.g. .ca in Canada,
   *         .de in Germany. <br>
   *         9 = simple name with an official TLD such as .org .net .edu .gov .biz.
   */
  private static int howValid(String email) {
    if (email == null) {
      return 0;
    }
    // Lower-case with a fixed locale: with the default locale a Turkish JVM would turn 'I' into
    // a dotless i, which neither the ASCII-only patterns nor the TLD sets can ever match.
    String address = email.trim().toLowerCase(Locale.ENGLISH);
    int dotPlace = address.lastIndexOf('.');
    if (0 < dotPlace && dotPlace < address.length() - 1) {
      // Have at least x.y, so there is something after the last dot to treat as the TLD.
      String tld = address.substring(dotPlace + 1);
      if (BAD_TLDS.contains(tld)) {
        // Deliberately invalid address.
        return 0;
      }
      if (hasCleanFields(address)) {
        // Try the patterns from strictest to most lenient; the first match decides the score.
        if (matches(SIMPLE_NAME_PATTERN, address)) {
          return scoreSimpleAddress(tld);
        }
        if (matches(DOTTED_NAME_PATTERN, address)) {
          return isKnownTld(tld) ? 5 : 2;
        }
        if (matches(IP_DOMAIN_PATTERN, address)) {
          return 4;
        }
        if (matches(LOOSE_DOTS_PATTERN, address)) {
          return isKnownTld(tld) ? 3 : 2;
        }
      }
    }
    // Allow even unclean addresses, and addresses without a TLD, a chance at passing RFC 822.
    try {
      InternetAddress.parse(address, true /* strict */);
      // No exception, so InternetAddress accepted it. This check is very permissive.
      return 1;
    }
    catch (AddressException e) {
      // InternetAddress rejected it as well.
      return 0;
    }
  }

  /**
   * Ranks the TLD of an address that matched the simple name pattern.
   *
   * @param tld Lower-cased text after the last dot of the address.
   * @return 9 for an official TLD, 8 for the default locale's country, 7 for any other national
   *         TLD, 6 for a rare TLD, and 2 for an unknown TLD (the same score the other name
   *         patterns give an unknown TLD).
   */
  private static int scoreSimpleAddress(String tld) {
    if (OFFICIAL_TLDS.contains(tld)) {
      return 9;
    }
    if (THIS_COUNTRY.equals(tld)) {
      return 8;
    }
    if (NATIONAL_TLDS.contains(tld)) {
      return 7;
    }
    if (RARE_TLDS.contains(tld)) {
      return 6;
    }
    return 2;
  }

  /**
   * Tells whether the TLD appears in any of the known lists.
   *
   * @param tld Lower-cased text after the last dot of the address.
   * @return True if the TLD is official, national (including the default locale's), or rare.
   */
  private static boolean isKnownTld(String tld) {
    return OFFICIAL_TLDS.contains(tld) || THIS_COUNTRY.equals(tld)
        || NATIONAL_TLDS.contains(tld) || RARE_TLDS.contains(tld);
  }

  /**
   * Checks that no field of the address (the pieces between @ and dots) starts or ends with an
   * underscore or a hyphen.
   *
   * @param address Trimmed, lower-cased address.
   * @return True if every field is clean.
   */
  private static boolean hasCleanFields(String address) {
    for (String field : FIELD_SPLITTER.split(address)) {
      if (field.startsWith("_") || field.endsWith("_")
          || field.startsWith("-") || field.endsWith("-")) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether the whole text matches the pattern.
   *
   * @param pattern Compiled pattern to apply.
   * @param text Text that must match in its entirety.
   * @return True on a full match.
   */
  private static boolean matches(Pattern pattern, String text) {
    Matcher matcher = pattern.matcher(text);
    return matcher.matches();
  }

  /**
   * Build a HashSet from an array of String literals.
   *
   * @param list array of strings
   * @return HashSet you can use to test if a string is in the set.
   */
  private static Set<String> toSet(String[] list) {
    // Presize so that all entries fit without a rehash at the default 0.75 load factor.
    Set<String> set = new HashSet<String>(Math.max((int) (list.length / .75f) + 1, 16));
    for (String entry : list) {
      set.add(entry);
    }
    return set;
  }
}