[swift-users] Zero cost abstraction 2D Iterator (equivalent to two nested for loops) impossible?

Mon Jan 9 23:56:50 CST 2017

Perhaps the optimizer unrolls the inner loop, and thus can skip safety checks. Naively, seems trickier to do for the iterator. 

> El ene. 4, 2017, a las 9:10 PM, Jens Persson via swift-users <swift-users at swift.org> escribió:
> 
> I noticed disabling safety checks made the custom Iterator as fast as the two nested for-loops.
> Karl, how does your test change when disabling safety checks?
> 
> Anyone having an idea why disabling safety checks should make the two sum funcs equally fast in my example program?
> 
> (It shouldn't be the preconditions of Table2D<>'s subscripts, unless the optimizer (with safety checks) can remove them in the case of the nested for-loops but not in the case of the custom iterator.) 
> 
> /Jens
> 
> 
>> On Wed, Jan 4, 2017 at 6:59 PM, Jens Persson <jens at bitcycle.com> wrote:
>> Thanks, I wonder if it is currently impossible to make it as fast as the nested for loops, ie that some optimizer improvement could fix it.
>> Here's a stripped down version of my code:
>> 
>> 
>> import QuartzCore // This is just for timing using CACurrentMediaTime()
>> 
>> 
>> struct Point2DInt {
>>     var x: Int
>>     var y: Int
>> }
>> 
>> struct Size2DInt {
>>     var width: Int
>>     var height: Int
>>     var rect: Rect2DInt { return Rect2DInt(x: 0, y: 0, width: width, height: height) }
>> }
>> 
>> struct Rect2DInt : Sequence {
>>     var description: String { return "(\(origin), \(size))" }
>>     var origin: Point2DInt
>>     var size: Size2DInt
>>     var minX: Int {
>>         get { return origin.x }
>>         set { size.width = maxX - newValue; origin.x = newValue }
>>     }
>>     var maxX: Int {
>>         get { return origin.x + size.width }
>>         set { size.width = newValue - minX }
>>     }
>>     var minY: Int {
>>         get { return origin.y }
>>         set { size.height = maxY - newValue; origin.y = newValue }
>>     }
>>     var maxY: Int {
>>         get { return origin.y + size.height }
>>         set { size.height = newValue - minY }
>>     }
>>     init(origin: Point2DInt, size: Size2DInt) {
>>         self.origin = origin
>>         self.size = size
>>     }
>>     init(x: Int, y: Int, width: Int, height: Int) {
>>         self.init(origin: Point2DInt(x: x, y: y), size: Size2DInt(width: width, height: height))
>>     }
>>     init(minX: Int, minY: Int, maxX: Int, maxY: Int) {
>>         self.init(origin: Point2DInt(x: minX, y: minY),
>>                   size: Size2DInt(width: maxX - minX, height: maxY - minY))
>>     }
>>     func makeIterator() -> Rect2DIntPointIterator {
>>         return Rect2DIntPointIterator(rect: self)
>>     }
>> }
>> 
>> 
>> // This is the crucial type here, this version is the fastest that
>> // I could find, but it's still slower than two nested for loops,
>> // see test below.
>> struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
>>     let startX, startY, stopX, stopY: Int
>>     var currentPoint: Point2DInt
>>     init(rect: Rect2DInt) {
>>         currentPoint = rect.origin
>>         startX = rect.origin.x
>>         startY = rect.origin.y
>>         stopX = rect.maxX
>>         stopY = rect.maxY
>>     }
>>     mutating func next() -> Point2DInt? {
>>         defer { currentPoint.x = currentPoint.x &+ 1 }
>>         if currentPoint.x == stopX {
>>             currentPoint.x = startX
>>             currentPoint.y = currentPoint.y &+ 1
>>             if currentPoint.y == stopY { return nil }
>>         }
>>         return currentPoint
>>     }
>> }
>> 
>> 
>> struct Table2D<Element> {
>>     let size: Size2DInt
>>     var storage: [Element]
>>     init(size: Size2DInt, filledWith element: Element) {
>>         precondition(size.width > 0 && size.height > 0)
>>         self.size = size
>>         self.storage = [Element](repeating: element, count: size.width * size.height)
>>     }
>>     subscript(x: Int, y: Int) -> Element {
>>         get {
>>             precondition(x >= 0 && y >= 0 && x < size.width && y < size.height)
>>             return storage[x + y * size.width]
>>         }
>>         set {
>>             precondition(x >= 0 && y >= 0 && x < size.width && y < size.height)
>>             storage[x + y * size.width] = newValue
>>         }
>>     }
>>     subscript(position: Point2DInt) -> Element {
>>         get { return self[position.x, position.y] }
>>         set { self[position.x, position.y] = newValue }
>>     }
>> }
>> 
>> func randomDouble() -> Double {
>>     // Returns a random Double in the range [0, 1)
>>     let ui64 = (UInt64(arc4random()) << 32) | UInt64(arc4random())
>>     return Double(ui64 >> UInt64(63 - Double.significandBitCount)) * .ulpOfOne/2
>> }
>> func randomInt(from: Int, to: Int) -> Int {
>>     // Returns an Int in the range [from, to)
>>     return Int(Double(from) + (randomDouble() * Double(to - from)).rounded(.down))
>> }
>> 
>> func randomSubrects(of rect: Rect2DInt, minSize: Size2DInt, count: Int) -> [Rect2DInt] {
>>     precondition(count > 0 && minSize.width > 0 && minSize.height > 0)
>>     var subrects = [Rect2DInt]()
>>     subrects.reserveCapacity(count)
>>     for _ in 0 ..< count {
>>         let size = Size2DInt(width: randomInt(from: minSize.width, to: rect.size.width),
>>                              height: randomInt(from: minSize.height, to: rect.size.height))
>>         let origin = Point2DInt(x: randomInt(from: 0, to: rect.size.width - size.width),
>>                                 y: randomInt(from: 0, to: rect.size.height - size.height))
>>         subrects.append(Rect2DInt(origin: origin, size: size))
>>     }
>>     return subrects
>> }
>> 
>> 
>> func randomTable(size: Size2DInt) -> Table2D<Double> {
>>     var table = Table2D(size: size, filledWith: 0.0)
>>     for p in table.size.rect { table[p] = randomDouble() }
>>     return table
>> }
>> 
>> 
>> func sum1(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
>>     var sum = 0.0
>>     for r in areas {
>>         // Using custom iterator:
>>         for p in r { sum += table[p] }
>>     }
>>     return sum
>> }
>> 
>> func sum2(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
>>     var sum = 0.0
>>     for r in areas {
>>         // Using two nested for loops:
>>         for y in r.minY ..< r.maxY {
>>             for x in r.minX ..< r.maxX {
>>                 sum = sum + table[x, y]
>>             }
>>         }
>>     }
>>     return sum
>> }
>> 
>> func test(
>>     sumFn: ([Rect2DInt], Table2D<Double>) -> Double,
>>     label: String,
>>     table: Table2D<Double>,
>>     areas: [Rect2DInt]
>>     )
>> {
>>     let t0 = CACurrentMediaTime()
>>     let sum = sumFn(areas, table)
>>     let t1 = CACurrentMediaTime()
>>     print(label, t1 - t0, "seconds (sum \(sum))")
>> }
>> 
>> for _ in 0 ..< 4 {
>>     let rndTable = randomTable(size: Size2DInt(width: 1000, height: 1000))
>>     let rndTableAreas = randomSubrects(of: rndTable.size.rect,
>>                                        minSize: Size2DInt(width: 100, height: 100),
>>                                        count: 1000)
>>     test(sumFn: sum1, label: "sum1 - using custom iterator  ", table: rndTable, areas: rndTableAreas)
>>     test(sumFn: sum2, label: "sum2 - using nested for-loops ", table: rndTable, areas: rndTableAreas)
>>     print()
>> }
>> 
>> //
>> // Typical output on my MacBook Pro (Retina, 15-inch, Late 2013):
>> //
>> // sum1 - using custom iterator   0.480134483019356 seconds (sum 153408603.850653)
>> // sum2 - using nested for-loops  0.348341046017595 seconds (sum 153408603.850653)
>> // 
>> // sum1 - using custom iterator   0.426998238021042 seconds (sum 149851816.622638)
>> // sum2 - using nested for-loops  0.34111139801098 seconds (sum 149851816.622638)
>> // 
>> // sum1 - using custom iterator   0.466021075990284 seconds (sum 155267702.297466)
>> // sum2 - using nested for-loops  0.351970263000112 seconds (sum 155267702.297466)
>> // 
>> // sum1 - using custom iterator   0.426723245007452 seconds (sum 146331850.202214)
>> // sum2 - using nested for-loops  0.340267747989856 seconds (sum 146331850.202214)
>> // 
>> 
>> 
>>> On Wed, Jan 4, 2017 at 12:42 PM, Karl <razielim at gmail.com> wrote:
>>> Hmmm that’s interesting. A brief test I ran:
>>> 
>>> import CoreGraphics
>>> import Foundation
>>> 
>>> struct PointIterator {
>>>   let rect: CGRect
>>>   var nextPoint: CGPoint
>>> 
>>>   let maxX: CGFloat
>>>   let maxY: CGFloat
>>> 
>>>   init(rect: CGRect) {
>>>     self.rect      = rect.standardized
>>>     self.nextPoint = self.rect.origin
>>>     // Cache for fast iteration
>>>     self.maxX      = self.rect.maxX
>>>     self.maxY      = self.rect.maxY
>>>   }
>>> 
>>>   mutating func next() -> CGPoint? {
>>>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>>>       return .none
>>>     }
>>>     defer {
>>>       nextPoint.x += 1
>>>       if nextPoint.x > maxX {
>>>         nextPoint.x  = rect.origin.x
>>>         nextPoint.y += 1
>>>       }
>>>     }
>>>     return nextPoint
>>>   }
>>> }
>>> 
>>> // Use iterator
>>> func iteratePoints_it(_ rect: CGRect, with: (CGPoint)->()) {
>>>   var it = PointIterator(rect: rect)
>>>   while let point = it.next() {
>>>     with(point)
>>>   }
>>> }
>>> 
>>> // Basic unwrapping of the iterator as a function (no ‘defer’)
>>> func iteratePoints_fe(_ rect: CGRect, with: (CGPoint)->()) {
>>>   let rect      = rect.standardized
>>>   var nextPoint = rect.origin
>>>   let maxX      = rect.maxX
>>>   let maxY      = rect.maxY
>>>   
>>>   while true {
>>>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>>>       return
>>>     }
>>>     with(nextPoint) 
>>>     nextPoint.x += 1
>>>     if nextPoint.x > maxX {
>>>       nextPoint.x  = rect.origin.x
>>>       nextPoint.y += 1
>>>     }
>>>   } 
>>> }
>>> 
>>> // for..in loop
>>> func iteratePoints_fe2(_ rect: CGRect, with: (CGPoint)->()) {
>>>   let rect = rect.standardized
>>>   let maxX = rect.maxX
>>>   let maxY = rect.maxY
>>>   for y in stride(from: rect.origin.y, to: maxY, by: 1) {
>>>     for x in stride(from: rect.origin.x, to: maxX, by: 1) {
>>>       with(CGPoint(x: x, y: y))
>>>     }
>>>   }
>>> }
>>> 
>>> func profile(_ iterations: Int, _ thing: ()->()) -> TimeInterval {
>>>   var totalTime: TimeInterval = 0
>>>   for _ in 0..<iterations {
>>>     let start = Date().timeIntervalSinceReferenceDate
>>>     thing()
>>>     totalTime += (Date().timeIntervalSinceReferenceDate - start)
>>>   }
>>>   return totalTime/TimeInterval(iterations)
>>> }
>>> 
>>> 
>>> let bigRect = CGRect(x: 0, y: 0, width: 10_000, height: 10_000)
>>> 
>>> let iterator = profile(10) { iteratePoints_it(bigRect)  { if $0.x > 1_000_000 { print("?") } } } // always false, won't be optimised out.
>>> let foreach = profile(10)  { iteratePoints_fe(bigRect)  { if $0.x > 1_000_000 { print("?") } } }
>>> let foreach2 = profile(10) { iteratePoints_fe2(bigRect) { if $0.x > 1_000_000 { print("?") } } }
>>> print("iterator: \(iterator) \nforeach: \(foreach) \nforeach2: \(foreach2)")
>>> 
>>> Results:
>>> 
>>> iterator: 0.316907703876495 
>>> foreach: 0.283202117681503 
>>> foreach2: 0.568318998813629
>>> 
>>> That ranking is consistent, too. Using an iterator does appear marginally slower than a basic unwrapping of the iterator in to a function.
>>> 
>>>> On 4 Jan 2017, at 09:56, Jens Persson via swift-users <swift-users at swift.org> wrote:
>>>> 
>>>> Hi,
>>>> 
>>>> I'm working on some low-level pixel processing code (stuff that is not possible to do using standard API or on eg GPU), and I had lots of eg these:
>>>> 
>>>> for y in someStartY ..< someStopY {
>>>>     for x in someStartX ..< someStopX {
>>>>         ... pixels[x, y] ...
>>>>     }
>>>> }
>>>> 
>>>> So I implemented some (value) types like eg IntPoint2D, IntSize2D, IntRect2D and I made an IntRect2DIterator so that IntRect2D could be a Sequence over its (discrete) points. With this I could rewrite the above like so:
>>>> 
>>>> for pixelPosAsIntPoint2D in someIntRect2D {
>>>>     ... pixels[pixelPosAsIntPoint2D] ...
>>>> }
>>>> 
>>>> For some reason the first version (two nested for loops for x and y) is always a bit faster than the abstracted version no matter how I write it (tried using eg &+ instead of + etc).
>>>> 
>>>> 
>>>> 
>>>> Is it possible to write as a zero cost abstraction like this, if so, how? If not, why?
>>>> 
>>>> 
>>>> 
>>>> 
>>>> PS
>>>> 
>>>> Note that eg this:
>>>> 
>>>> for y in someStartY ..< someStopY {
>>>>     for x in someStartX ..< someStopX {
>>>>        let pixelPosAsIntPoint2D = IntPoint2D(x: x, y: y)
>>>>         ... pixels[pixelPosAsIntPoint2D] ...
>>>>     }
>>>> }
>>>> 
>>>> is exactly as fast as the top example (using just ... pixels[x, y] ...). So the difference in execution time seems to be due to something in the Iterator and not eg the pixel accessing subscript taking the 2d int point type instead of separate x and y ints.
>>>> 
>>>> Here is one Iterator variant that I have tested:
>>>> 
>>>> struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
>>>>     let startX, startY, stopX, stopY: Int
>>>>     var px, py: Int
>>>>     init(rect: Rect2DInt) {
>>>>         startX = rect.origin.x
>>>>         startY = rect.origin.y
>>>>         stopX = rect.maxX
>>>>         stopY = rect.maxY
>>>>         px = startX
>>>>         py = startY
>>>>     }
>>>>     mutating func next() -> Point2DInt? {
>>>>         defer { px = px &+ 1 }
>>>>         if px == stopX {
>>>>             px = startX
>>>>             py = py &+ 1
>>>>             if py == stopY { return nil }
>>>>         }
>>>>         return Point2DInt(x: px, y: py)
>>>>     }
>>>> }
>>>> 
>>>> 
>>>> And here are typical execution times from an example test:
>>>> 2.1 seconds using my Iterator (the fastest I can get it, no matter how I try to rewrite it).
>>>> 1.5 seconds using nested x, y for loops.
>>>> 
>>>> I'm pretty sure my testing is done thoroughly (measuring average of many runs, using random data, avoiding dead code elimination, whole module optimization, etc).
>>>> 
>>>> I have tried profiling the code and looking at the disassmbly but I'm failing to understand what's going on.
>>>> 
>>>> So the ultimate answer would be in the form of a (2d, Int) Rectangle type whose (2d, Int) Points can be iterated in a for loop, at zero cost compared to doing the same using two nested for loops. Or an explanation of why this is impossible.
>>>> 
>>>> DS
>>>> 
>>>> /Jens
>>>> 
>>>> _______________________________________________
>>>> swift-users mailing list
>>>> swift-users at swift.org
>>>> https://lists.swift.org/mailman/listinfo/swift-users
>>> 
>> 
> 
> _______________________________________________
> swift-users mailing list
> swift-users at swift.org
> https://lists.swift.org/mailman/listinfo/swift-users
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.swift.org/pipermail/swift-users/attachments/20170110/f38ef40e/attachment.html>