192 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Julia
		
	
	
	
	
	
		
		
			
		
	
	
			192 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Julia
		
	
	
	
	
	
|   | # Following work by @jiahao, we compute character widths using a combination of | |||
|  | #   * advance widths from GNU Unifont (advance width 512 = 1 en) | |||
|  | #   * UAX 11: East Asian Width | |||
|  | #   * a few exceptions as needed | |||
|  | # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 | |||
|  | # | |||
|  | # Requires Julia (obviously) and FontForge. | |||
|  | 
 | |||
|  | ############################################################################# | |||
# Julia 0.3/0.4 compatibility shim (idea borrowed from the Compat package):
# on versions older than 0.4.0-dev+1419, define the newer `UInt16` name as
# an alias of the old `Uint16` spelling.
if VERSION < v"0.4.0-dev+1419"
    const UInt16 = Uint16
end

# Table mapping codepoint => character width; it starts out empty.
CharWidths = Dict{Int,Int}()
|  | 
 | |||
|  | ############################################################################# | |||
# Look up the category code of codepoint `c` by calling `utf8proc_category`
# in ../libutf8proc, rather than the copy bundled with Julia, to minimize
# bootstrapping complexity when a new version of Unicode comes out.
# The result is returned as a Cint.
function catcode(c)
    return ccall((:utf8proc_category,"../libutf8proc"), Cint, (Int32,), c)
end
|  | 
 | |||
|  | # use Base.UTF8proc module to get category codes constants, since | |||
|  | # we won't change these in utf8proc. | |||
|  | import Base.UTF8proc | |||
|  | 
 | |||
|  | ############################################################################# | |||
# Use a default width of 1 for all character categories that are
# letter/symbol/number-like.  This can be overridden by Unifont or UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.
|  | 
 | |||
# Category codes that may contain zero-width characters.  Codepoints whose
# category is NOT in this set receive a default width of 1 below; all other
# codepoints are left out of CharWidths at this stage.
zerowidth = Set{Int}()
for category in [UTF8proc.UTF8PROC_CATEGORY_CN,
                 UTF8proc.UTF8PROC_CATEGORY_MN,
                 UTF8proc.UTF8PROC_CATEGORY_MC,
                 UTF8proc.UTF8PROC_CATEGORY_ME,
                 UTF8proc.UTF8PROC_CATEGORY_SK,
                 UTF8proc.UTF8PROC_CATEGORY_ZS,
                 UTF8proc.UTF8PROC_CATEGORY_ZL,
                 UTF8proc.UTF8PROC_CATEGORY_ZP,
                 UTF8proc.UTF8PROC_CATEGORY_CC,
                 UTF8proc.UTF8PROC_CATEGORY_CF,
                 UTF8proc.UTF8PROC_CATEGORY_CS,
                 UTF8proc.UTF8PROC_CATEGORY_CO]
    push!(zerowidth, category)
end

# Assign the fallback width to every codepoint outside those categories.
for c in 0x0000:0x110000
    catcode(c) in zerowidth && continue
    CharWidths[c] = 1
end
|  | 
 | |||
|  | ############################################################################# | |||
|  | # Widths from GNU Unifont | |||
|  | 
 | |||
# Unifont release to use; the UNIFONT_VERSION environment variable
# overrides the built-in default.
universion = get(ENV, "UNIFONT_VERSION", "7.0.06")

# For each of the two font files: download the .ttf if it is not already
# present, then have FontForge convert it to a text .sfd file if that is
# not already present.
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
    if !isfile("$fontfile.ttf")
        download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
    end
    if !isfile("$fontfile.sfd")
        run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
    end
end
|  | 
 | |||
# Read character widths from a FontForge .sfd file.
#
# Arguments:
#   filename   - path of the .sfd file to scan
#   CharWidths - table (codepoint => width) that is updated in place; a new
#                empty table is created when the argument is omitted
#
# Returns the CharWidths table that was passed in (or created).
#
# The file is scanned line by line with a two-state machine:
#   :seekchar - skip lines until one containing "StartChar: " is found
#   :readdata - take the codepoint from the 3rd whitespace-separated token
#               of an "Encoding:" line and the advance width from the 2nd
#               token of a "Width:" line; once both are known and the
#               codepoint is non-negative, record the width and go back
#               to :seekchar
# Advance widths are converted to columns by integer division by 512.
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    state = :seekchar
    codepoint = width = nothing
    # open(readlines, filename) closes the file as soon as it has been read,
    # instead of leaving the handle open until it is garbage-collected.
    for line in open(readlines, filename)
        if state == :seekchar       # e.g. "StartChar: nonmarkingreturn"
            if contains(line, "StartChar: ")
                codepoint = nothing
                width = nothing
                state = :readdata
            end
        elseif state == :readdata   # e.g. "Encoding: 65538 -1 2", "Width: 1024"
            contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
            contains(line, "Width:") && (width = int(split(line)[2]))
            if codepoint != nothing && width != nothing && codepoint >= 0
                w = div(width, 512) # 512 units to the en
                if w > 0
                    # only add nonzero widths, since (1) the default is zero
                    # and (2) this circumvents some apparent bugs in Unifont
                    # (https://savannah.gnu.org/bugs/index.php?45395)
                    CharWidths[codepoint] = w
                end
                state = :seekchar
            end
        end
    end
    CharWidths
end
# Merge the widths from both Unifont .sfd files into CharWidths.
# parsesfd updates the table it is given in place and returns that same
# table, so the return value does not need to be reassigned.
for sfdfile in ["unifont-$universion.sfd", "unifont_upper-$universion.sfd"]
    parsesfd(sfdfile, CharWidths)
end
|  | 
 | |||
|  | ############################################################################# | |||
|  | # Widths from UAX #11: East Asian Width | |||
|  | #   .. these take precedence over the Unifont width for all codepoints | |||
|  | #      listed explicitly as wide/full/narrow/half-width | |||
|  | 
 | |||
# Fetch EastAsianWidth.txt unless a local copy already exists.
isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")

# open(readlines, ...) closes the file as soon as it has been read, instead
# of leaving the handle open until it is garbage-collected.
for line in open(readlines, "EastAsianWidth.txt")
    # Skip lines that are entirely a comment
    line[1] == '#' && continue
    # Drop any trailing comment, then split into "range ; width class"
    tokens = split(split(line, '#')[1], ';')
    length(tokens) >= 2 || continue
    eawclass = strip(tokens[2])

    # Parse the code point field, either "XXXX" or "XXXX..YYYY"
    rangetokens = split(tokens[1], "..")
    charstart = uint32("0x"*rangetokens[1])
    charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

    # Pick the width once per line rather than once per codepoint.
    # Classes other than W/F/Na/H leave the existing widths untouched.
    if eawclass == "W" || eawclass == "F"       # wide or full
        eawwidth = 2
    elseif eawclass == "Na" || eawclass == "H"  # narrow or half
        eawwidth = 1
    else
        continue
    end

    for c in charstart:charend
        CharWidths[c] = eawwidth
    end
end
|  | 
 | |||
|  | ############################################################################# | |||
|  | # A few exceptions to the above cases, found by manual comparison | |||
|  | # to other wcwidth functions and similar checks. | |||
|  | 
 | |||
# Force a width of 0 for every codepoint already present in CharWidths
# whose category is one of:
#   * Cf (format control), except for the Arabic characters
#     0x0601, 0x0602, 0x0603 and 0x06dd (see unicode std 6.2, sec. 8.2)
#   * Mn (non-spacing combining marks): Unifont has nonzero width for a
#     number of these, e.g. (in 7.0.06) f84,17b4,17b5,180b,180d,2d7f and
#     the variation selectors
#   * Co / Cn (private-use / unassigned): Unifont includes ConScript Unicode
#     Registry PUA fonts, but since these are nonstandard it seems
#     questionable to recognize them
#   * Cc (control): for some reason, Unifont has width-2 glyphs for ASCII
#     control chars
for c in keys(CharWidths)
    category = catcode(c)
    forcezero = (category == UTF8proc.UTF8PROC_CATEGORY_CF && !(c in [0x0601,0x0602,0x0603,0x06dd])) ||
                category == UTF8proc.UTF8PROC_CATEGORY_MN ||
                category == UTF8proc.UTF8PROC_CATEGORY_CO ||
                category == UTF8proc.UTF8PROC_CATEGORY_CN ||
                category == UTF8proc.UTF8PROC_CATEGORY_CC
    if forcezero
        CharWidths[c] = 0
    end
end
|  | 
 | |||
# Hand-assigned widths for a few separator and space characters:
#   0x2028  LINE SEPARATOR (Zl)       - by definition zero width (on the same line)
#   0x2029  PARAGRAPH SEPARATOR (Zp)  - by definition zero width (on the same line)
#   0x202f  NARROW NO-BREAK SPACE (Zs) - by definition narrow = width of 1 en space
#   0x2001  EM QUAD (Zs)              - by definition wide = width of 1 em space
#   0x2003  EM SPACE (Zs)             - by definition wide = width of 1 em space
for (codepoint, w) in [(0x2028, 0),
                       (0x2029, 0),
                       (0x202f, 1),
                       (0x2001, 2),
                       (0x2003, 2)]
    CharWidths[codepoint] = w
end
|  | 
 | |||
|  | ############################################################################# | |||
# Output (to a file or pipe) for processing by data_generator.rb.
# Consecutive codepoints sharing the same width are collapsed into a single
# line of the form "XXXX..YYYY; w" (or "XXXX; w" for a single codepoint).
# Codepoints missing from CharWidths count as width 0.
# NOTE(review): the loop below prints every run, including runs of width 0,
# although zero is described as the default; confirm whether that is intended.

# Uppercase hexadecimal, zero-padded to at least 4 digits.
uhex(c) = uppercase(hex(c,4))

firstc = 0x000000   # first codepoint of the run currently being collected
lastv = 0           # width shared by the codepoints of that run
for c in 0x0000:0x110000
    v = get(CharWidths, c, 0)
    # Keep extending the run while the width is unchanged; 0x110000 is one
    # past the last codepoint and only serves to flush the final run.
    (v == lastv && c != 0x110000) && continue
    v < 4 || error("invalid charwidth $v for $c")
    label = firstc+1 < c ? string(uhex(firstc), "..", uhex(c-1)) : uhex(firstc)
    println(label, "; ", lastv)
    firstc = c
    lastv = v
end