001    package org.hackystat.utilities.email;
002    
003    import java.util.HashSet;
004    import java.util.Locale;
005    import java.util.Set;
006    import java.util.regex.Matcher;
007    import java.util.regex.Pattern;
008    
009    import javax.mail.internet.AddressException;
010    import javax.mail.internet.InternetAddress;
011    
012    /**
013     * Validate syntax of email addresses. Does not probe to see if mailserver exists in DNS or online.
014     * See MailProber for that. See ValidateEmailFile for an example of how to use this class.
015     * 
016     * Minor reformatting.  Found in:
017     * http://www.velocityreviews.com/forums/
018     *  t128486-re-can-javamail-detect-a-nonexistant-email-address.html
019     * 
020     * @author Roedy Green, Canadian Mind Products
021     * @author Philip Johnson. 
022     * @version 1.0 to do: check validity of & in first part of email address. Appears in practice.
023     */
024    public final class ValidateEmailSyntax {
025      
026      /** Make this class noninstantiable. */
027      private ValidateEmailSyntax() {
028        // Do nothing.
029      }
030      
031      /**
032       * Returns true if the email appears to be valid.  
033       * @param email The email address of interest. 
034       * @return True if it appears to be valid. 
035       */
036      public static boolean isValid(String email) {
037        return howValid(email) >= 2;
038      }
039    
040      /**
041       * Check how likely an email address is to be valid. The higher the number returned, the more
042       * likely the address is valid. This method does not probe the internet in any way to see if the
043       * corresponding mail server or domain exists.
044       * 
045       * @param email bare computer email address. e.g. No "Roedy Green" <> style addresses. No local
046       *        addresses, eg. roedy.
047       * 
048       * @return 0 = email address is definitely malformed, e.g. missing @. ends in .invalid <br>
049       *         1 = address does not meet one of the valid patterns below. It still might be ok
050       *         according to some obscure rule in RFC 822 Java InternetAddress accepts it as valid.
051       *         <br>
052       *         2 = unknown top level domain. <br>
053       *         3 = dots at beginning or end, doubled in name. <br>
054       *         4 = address of form xxx@[209.139.205.2] using IP <br>
055       *         5 = address of form Dots _ or - in first part of name <br>
056       *         6 = addreess of form rare, but known, domain <br>
057       *         7 = address of form or any national suffix. <br>
058       *         8 = address of form the matching this national suffix, e.g. .ca in Canada, .de in
059       *         Germany <br>
060       *         9 = address of form .org .net .edu .gov ..biz -- official domains
061       */
062      private static int howValid(String email) { //NOPMD (reassign email)
063        if (email == null) {
064          return 0;
065        }
066        email = email.trim().toLowerCase();
067        int dotPlace = email.lastIndexOf('.');
068        if (0 < dotPlace && dotPlace < email.length() - 1) {
069          /* have at least x.y */
070          String tld = email.substring(dotPlace + 1);
071          if (badTLDs.contains(tld)) {
072            /* deliberate invalid address */
073            return 0;
074          }
075          // make sure none of fragments start or end in _ or -
076          String[] fragments = splitter.split(email);
077          boolean clean = true;
078          for (int i = 0; i < fragments.length; i++) {
079            if (fragments[i].startsWith("_") || fragments[i].endsWith("_")
080                || fragments[i].startsWith("-") || fragments[i].endsWith("-")) {
081              clean = false;
082              break;
083            }
084          } // end for
085          if (clean) {
086            Matcher m9 = p9.matcher(email);
087            if (m9.matches()) {
088              if (officialTLDs.contains(tld)) {
089                return 9;
090              }
091              else if (thisCountry.equals(tld)) {
092                return 8;
093              }
094              else if (nationalTLDs.contains(tld)) {
095                return 7;
096              }
097              else if (rareTLDs.contains(tld)) { //NOPMD
098                return 6;
099              }
100              else {
101                return 3; /* unknown tld */
102              }
103            }
104            // allow dots in name
105            Matcher m5 = p5.matcher(email);
106            if (m5.matches()) {
107              if (officialTLDs.contains(tld)) {
108                return 5;
109              }
110              else if (thisCountry.equals(tld)) {
111                return 5;
112              }
113              else if (nationalTLDs.contains(tld)) {
114                return 5;
115              }
116              else if (rareTLDs.contains(tld)) {
117                return 5;
118              }
119              else {
120                return 2; /* unknown tld */
121              }
122            }
123    
124            // IP
125            Matcher m4 = p4.matcher(email);
126            if (m4.matches()) {
127              return 4; /* can't tell TLD */ 
128            }
129    
130            // allow even lead/trail dots in name, except at start of domain
131            Matcher m3 = p3.matcher(email);
132            if (m3.matches()) {
133              if (officialTLDs.contains(tld)) {
134                return 3;
135              }
136              else if (thisCountry.equals(tld)) {
137                return 3;
138              }
139              else if (nationalTLDs.contains(tld)) {
140                return 3;
141              }
142              else if (rareTLDs.contains(tld)) {
143                return 3;
144              }
145              else {
146                return 2; /* unknown domain */ 
147              }
148            }
149          } // end if clean
150        }
151        // allow even unclean addresses, and addresses without a TLD to have a whack at passing RFC:822
152        try {
153    
154          /*
155           * see if InternetAddress likes it, it follows RFC:822. It will names without domains though.
156           */
157          InternetAddress.parse(email, true /* strict */);
158          // it liked it, no exception happened. Seems very sloppy.
159          return 1;
160        }
161        catch (AddressException e) {
162          // it did not like it
163          return 0;
164        }
165      }
166      
167     
168    
169      // allow _ - in name, lead and trailing ones are filtered later, no +.
170      private static Pattern p9 = Pattern.compile("[a-z0-9\\-_]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
171    
172      // to split into fields
173      private static Pattern splitter = Pattern.compile("[@\\.]");
174    
175      // to allow - _ dots in name, no +
176      private static Pattern p5 = Pattern
177          .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
178    
179      // IP style names, no +
180      private static Pattern p4 = Pattern
181          .compile("[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)*@\\[([0-9]{1,3}\\.){3}[0-9]{1,3}\\]");
182    
183      // allow dots anywhere, but not at start of domain name, no +
184      private static Pattern p3 = Pattern
185          .compile("[a-z0-9\\-_\\.]++@[a-z0-9\\-_]++(\\.[a-z0-9\\-_]++)++");
186    
187      /**
188       * build a HashSet from a array of String literals.
189       * 
190       * @param list array of strings
191       * @return HashSet you can use to test if a string is in the set.
192       */
193      private static Set<String> hmaker(String[] list) {
194        Set<String> map = new HashSet<String>(Math.max((int) (list.length / .75f) + 1, 16));
195        for (int i = 0; i < list.length; i++) {
196          map.add(list[i]);
197        }
198        return map;
199      }
200    
201      private static final String thisCountry = Locale.getDefault().getCountry().toLowerCase();
202    
203      private static final Set<String> officialTLDs = hmaker(new String[] { "aero", "biz", "coop",
204          "com", "edu", "gov", "info", "mil", "museum", "name", "net", "org", "pro", });
205    
206      private static final Set<String> rareTLDs = hmaker(new String[] { "cam", "mp3", "agent",
207          "art", "arts", "asia", "auction", "aus", "bank", "cam", "chat", "church", "club", "corp",
208          "dds", "design", "dns2go", "e", "email", "exp", "fam", "family", "faq", "fed", "film",
209          "firm", "free", "fun", "g", "game", "games", "gay", "ger", "globe", "gmbh", "golf", "gov",
210          "help", "hola", "i", "inc", "int", "jpn", "k12", "kids", "law", "learn", "llb", "llc", "llp",
211          "lnx", "love", "ltd", "mag", "mail", "med", "media", "mp3", "netz", "nic", "nom", "npo",
212          "per", "pol", "prices", "radio", "rsc", "school", "scifi", "sea", "service", "sex", "shop",
213          "sky", "soc", "space", "sport", "tech", "tour", "travel", "usvi", "video", "web", "wine",
214          "wir", "wired", "zine", "zoo", });
215    
216      private static final Set<String> nationalTLDs = hmaker(new String[] { "ac", "ad", "ae", "af",
217          "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as", "at", "au", "aw", "az", "ba", "bb",
218          "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bv", "bw",
219          "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr",
220          "cu", "cv", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "eh",
221          "er", "es", "et", "fi", "fj", "fk", "fm", "fo", "fr", "fx", "ga", "gb", "gd", "ge", "gf",
222          "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk",
223          "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it",
224          "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
225          "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "mg",
226          "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx",
227          "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om",
228          "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa",
229          "re", "ro", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl",
230          "sm", "sn", "so", "sr", "st", "sv", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk",
231          "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "um", "us", "uy",
232          "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "yu", "za", "zm",
233          "zw", });
234    
235      private static final Set<String> badTLDs = hmaker(new String[] { "invalid", "nowhere", 
236          "noone", });
237    }